Sleipnir
src/seekdataset.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "seekdataset.h"
00023 #include "seekreader.h"
00024 #include "seekevaluate.h"
00025 
00026 namespace Sleipnir {
00027 
00028 CSeekDataset::CSeekDataset(){
00029     r = NULL;
00030     rData = NULL;
00031     dbMap = NULL;
00032     geneMap = NULL;
00033     queryMap = NULL;
00034     platform = NULL;
00035     geneAverage.clear();
00036     geneVariance.clear();
00037     genePresence.clear();
00038     query.clear();
00039     queryIndex.clear();
00040     iQuerySize = 0;
00041     iNumGenes = 0;
00042     iDBSize = 0;
00043     m_bIsNibble = false;
00044     m_fDsetAverage = CMeta::GetNaN();
00045     m_fDsetStdev = CMeta::GetNaN();
00046     weight.clear();
00047     sum_weight = -1;
00048 }
00049 
00050 CSeekDataset::~CSeekDataset(){
00051     DeleteQuery();
00052     DeleteQueryBlock();
00053     if(geneMap!=NULL){
00054         delete geneMap;
00055         geneMap = NULL;
00056         iNumGenes = 0;
00057     }
00058     geneAverage.clear();
00059     geneVariance.clear();
00060     genePresence.clear();
00061     m_fDsetAverage = CMeta::GetNaN();
00062     m_fDsetStdev = CMeta::GetNaN();
00063     platform = NULL;
00064 }
00065 
00066 //Copy CSeekDataset from a given object
00067 bool CSeekDataset::Copy(CSeekDataset *src){
00068     //copies the following data if available: geneAverage, genePresence
00069     //geneVariance, dsetAverage, dsetStdev, geneMap, platform
00070     
00071     if(src->geneAverage.size()>0){
00072         //fprintf(stderr, "Great a!\n");
00073         geneAverage.resize(src->geneAverage.size());
00074         utype i;
00075         for(i=0; i<src->geneAverage.size(); i++){
00076             geneAverage[i] = src->geneAverage[i];
00077         }
00078     }
00079     if(src->genePresence.size()>0){
00080         //fprintf(stderr, "Great b!\n");
00081         genePresence.resize(src->genePresence.size());
00082         utype i;
00083         for(i=0; i<src->genePresence.size(); i++){
00084             genePresence[i] = src->genePresence[i];
00085         }
00086     }
00087     if(src->geneVariance.size()>0){
00088         geneVariance.resize(src->geneVariance.size());
00089         utype i;
00090         for(i=0; i<src->geneVariance.size(); i++){
00091             geneVariance[i] = src->geneVariance[i];
00092         }
00093         //copy(src->geneVariance.begin(), src->geneVariance.end(),
00094         //  geneVariance.begin());
00095     }
00096     m_fDsetAverage = src->m_fDsetAverage;
00097     m_fDsetStdev = src->m_fDsetStdev;
00098 
00099     if(geneMap!=NULL){
00100         delete geneMap;
00101         iNumGenes = 0;
00102     }
00103 
00104     utype iSize = genePresence.size();
00105     iNumGenes = iSize;
00106     geneMap = new CSeekIntIntMap(src->geneMap);
00107 
00108     return true;    
00109 }
00110 
00111 bool CSeekDataset::ReadDatasetAverageStdev(const string &strFileName){
00112     vector<float> t;
00113     CSeekTools::ReadArray(strFileName.c_str(), t);
00114     m_fDsetAverage = t[0];
00115     m_fDsetStdev = t[1];
00116     return true;
00117 }
00118 
00119 bool CSeekDataset::ReadGeneAverage(const string &strFileName){
00120     return CSeekTools::ReadArray(strFileName.c_str(), geneAverage);
00121 }
00122 
00123 bool CSeekDataset::ReadGenePresence(const string &strFileName){
00124     return CSeekTools::ReadArray(strFileName.c_str(), genePresence);
00125 }
00126 
00127 bool CSeekDataset::ReadGeneVariance(const string &strFileName){
00128     return CSeekTools::ReadArray(strFileName.c_str(), geneVariance);
00129 }
00130 
00131 
00132 bool CSeekDataset::InitializeGeneMap(){
00133     if(geneAverage.empty() || genePresence.empty()) {
00134         cerr << "Gene average or gene presence unread" << endl;
00135         return false;
00136     }
00137     utype i;
00138     utype iSize = genePresence.size();
00139     iNumGenes = iSize;
00140     geneMap = new CSeekIntIntMap(iSize);
00141     vector<char>::const_iterator iterGenePresence = genePresence.begin();
00142     vector<float>::const_iterator iterGeneAverage = geneAverage.begin();
00143 
00144     for(i=0; iterGenePresence!=genePresence.end(); i++,
00145         iterGenePresence++, iterGeneAverage++){
00146         if(*iterGenePresence==1 && !CMeta::IsNaN(*iterGeneAverage)){
00147             geneMap->Add(i);
00148         }
00149     }
00150     return true;
00151 }
00152 
00153 
00154 bool CSeekDataset::InitializeQueryBlock(const vector<utype> &queryBlock){
00155 
00156     DeleteQueryBlock();
00157 
00158     dbMap = new CSeekIntIntMap(iNumGenes);
00159 
00160     vector<utype>::const_iterator iterQ = queryBlock.begin();
00161     for(; iterQ!=queryBlock.end(); iterQ++){
00162         if(CSeekTools::IsNaN(geneMap->GetForward(*iterQ))){
00163             //this query gene is not present in the dataset
00164             continue;
00165         }
00166         dbMap->Add(*iterQ);
00167     }
00168     iDBSize = dbMap->GetNumSet();
00169 
00170     bool DEBUG = false;
00171 
00172     if(iDBSize==0){
00173         if(DEBUG) cerr << "Dataset will be skipped" << endl;
00174         DeleteQueryBlock();
00175         return true;
00176     }
00177 
00178     //fprintf(stderr, "0x1 %lu %d %d\n", CMeta::GetMemoryUsage(), iDBSize, iNumGenes);
00179     r = CSeekTools::Init2DArray(iDBSize, iNumGenes, (unsigned char) 255);
00180     //fprintf(stderr, "0x2 %lu\n", CMeta::GetMemoryUsage());
00181     //free(r[0]);
00182     //free(r);
00183     //fprintf(stderr, "0x3 %lu\n", CMeta::GetMemoryUsage());
00184 
00185 
00186     return true;
00187 }
00188 
00189 
00190 bool CSeekDataset::InitializeQuery(const vector<utype> &query){
00191     DeleteQuery();
00192 
00193     if(iDBSize==0 || dbMap==NULL) return true;
00194     //must require initializequeryBlock be executed first
00195 
00196     queryMap = new CSeekIntIntMap(iNumGenes);
00197 
00198     utype i;
00199     vector<utype>::const_iterator iterQ = query.begin();
00200 
00201     vector<AResult> a;
00202     a.resize(query.size());
00203     vector<AResult>::iterator iterA = a.begin();
00204 
00205     for(iQuerySize = 0; iterQ!=query.end(); iterQ++){
00206         //if the query does not exist in this data-set, continue
00207         if(CSeekTools::IsNaN(i = dbMap->GetForward(*iterQ))) continue;
00208         //.i: query genes
00209         (*iterA).i = *iterQ;
00210         //.f: query gene position in dbMap
00211         (*iterA).f = i;
00212         iterA++;
00213         iQuerySize++;
00214     }
00215 
00216     bool DEBUG = false;
00217     if(iQuerySize==0){
00218         if(DEBUG) cerr << "Dataset will be skipped" << endl;
00219         DeleteQuery();
00220         return true;
00221     }
00222 
00223     a.resize(iQuerySize);
00224     sort(a.begin(), a.end(), Ascending());
00225 
00226     for(iterA = a.begin(); iterA!=a.end(); iterA++){
00227         //add gene to queryMap
00228         queryMap->Add((*iterA).i);
00229         this->query.push_back((*iterA).i);
00230         //add (gene-position in dbMap) to queryIndex
00231         this->queryIndex.push_back((*iterA).f);
00232     }
00233 
00234     this->queryIndex.resize(this->queryIndex.size());
00235     this->query.resize(this->query.size());
00236     return true;
00237 }
00238 
00239 
00240 bool CSeekDataset::DeleteQuery(){
00241     if(queryMap!=NULL){
00242         delete queryMap;
00243         queryMap = NULL;
00244     }
00245     iQuerySize = 0;
00246     rData = NULL;
00247     weight.clear();
00248     query.clear();
00249     queryIndex.clear();
00250     sum_weight = -1;
00251     return true;
00252 }
00253 
00254 
00255 bool CSeekDataset::DeleteQueryBlock(){
00256     DeleteQuery();
00257     if(dbMap!=NULL){
00258         delete dbMap;
00259         dbMap = NULL;
00260     }
00261     if(r!=NULL){
00262         CSeekTools::Free2DArray(r);
00263         r = NULL;
00264     }
00265     iDBSize = 0;
00266     return true;
00267 }
00268 
00269 const vector<utype>& CSeekDataset::GetQuery() const{
00270     return this->query;
00271 }
00272 
00273 const vector<utype>& CSeekDataset::GetQueryIndex() const{
00274     return this->queryIndex;
00275 }
00276 
00277 utype** CSeekDataset::GetDataMatrix(){
00278     return rData;
00279 }
00280 
00281 bool CSeekDataset::InitializeDataMatrix(utype **rD,
00282     const vector<float> &quant, const utype &iRows,
00283     const utype &iColumns, const bool bSubtractAvg,
00284     const bool bNormPlatform, const bool logit,
00285     const enum CSeekDataset::DistanceMeasure dist_measure,
00286     const float cutoff, 
00287     const bool bRandom, gsl_rng *rand){
00288     /* assume platform is already set */
00289     //Assuming rData is not NULL
00290 
00291     bool bCorrelation = false;
00292     if(dist_measure==CSeekDataset::CORRELATION)
00293         bCorrelation = true;
00294 
00295     if(bCorrelation && (bSubtractAvg || bNormPlatform)){
00296         fprintf(stderr, "%s, %s\n", "correlation mode enabled",
00297             "please set subtract_avg, subtract_platform to false");
00298         return false;
00299     }
00300 
00301     utype i, j;
00302     rData = rD;
00303     utype ii;
00304     utype iNumGenes = geneMap->GetNumSet();
00305     utype iNumQueries = iQuerySize;
00306 
00307     //fprintf(stderr, "iNumQueries is %d\n", iNumQueries);
00308     //iRows is the gene id, iColumns is the query id
00309     //default value for all rData entries is 0
00310     memset(&rData[0][0], 0, sizeof(utype)*iRows*iColumns);
00311 
00312     //assume queryIndex is already sorted
00313     vector<utype> offset;
00314     offset.push_back(0);
00315     for(i=1; i<queryIndex.size(); i++)
00316         offset.push_back(queryIndex[i] - queryIndex[i-1]);
00317 
00318     if(bSubtractAvg){ //z-score (Z) + average z score subtraction (A)
00319 
00320         if(bNormPlatform){ //subtract platform avg / platform stddev
00321             float *platform_avg = new float[iColumns];
00322             float *platform_stdev = new float[iColumns];
00323 
00324             //GetColumns() is numQuery
00325             for(j=0; j<iNumQueries; j++){
00326                 utype jj = this->query[j];
00327                 platform_avg[j] = platform->GetPlatformAvg(jj);
00328                 platform_stdev[j] = platform->GetPlatformStdev(jj);
00329             }
00330 
00331             const vector<utype> &allRGenes = geneMap->GetAllReverse();
00332             float a = 0;
00333             float vv = 0;
00334             unsigned char x = 0;
00335             utype iGeneMapSize = geneMap->GetNumSet();
00336             if(logit){ //NOT checked
00337                 for(ii=0; ii<iGeneMapSize; ii++){
00338                     for(j=0, i = allRGenes[ii], a=GetGeneAverage(i);
00339                         j<iNumQueries; j++){
00340                         if((x = r[queryIndex[j]][i])==255) continue;
00341                         vv = ((log(quant[x]) - log((float) 1.0 - quant[x]))
00342                             - a - platform_avg[j]) / platform_stdev[j];
00343                         vv = max((float) min(vv, (float)3.2), (float)-3.2);
00344                         //By default, cutoff = -nan (i.e., always true)
00345                         if(vv>cutoff){
00346                             rData[i][j]= (utype) (vv*100.0) + 320;
00347                             //fprintf(stderr, "%.5f %.5f %.5f %.5f\n", quant[x], vv, a, platform_avg[j]);
00348                         }else{
00349                             rData[i][j] = 0;
00350                         }
00351                     }
00352                 }
00353             }else{ //if just normal score
00354                 for(ii=0; ii<iGeneMapSize; ii++){
00355                     for(j=0, i = allRGenes[ii], a=GetGeneAverage(i);
00356                         j<iNumQueries; j++){
00357                         if((x = r[queryIndex[j]][i])==255) continue;
00358                         vv = (quant[x] - a - platform_avg[j])
00359                             / platform_stdev[j];
00360                         vv = max((float) min(vv, (float)3.2), (float)-3.2);
00361                         if(vv>cutoff){
00362                             rData[i][j]= (utype) (vv*100.0) + 320;
00363                             //fprintf(stderr, "r %.2f\n", quant[x]);
00364                         }else{
00365                             rData[i][j]= 0;
00366                         }
00367                     }
00368                 }
00369             }
00370 
00371             delete[] platform_avg;
00372             delete[] platform_stdev;
00373 
00374         }else{ //do not normalize by platform, just subtract gene avg 
00375             if(logit){
00376                 for(ii=0; ii<iNumGenes; ii++){
00377                     float a, vv; unsigned char x;
00378                     for(i = geneMap->GetReverse(ii),
00379                         a = GetGeneAverage(i), j=0; j<iNumQueries; j++){
00380                         if((x = r[queryIndex[j]][i])==255) continue;
00381                         vv = log(quant[x]) - log((float)(1.0-quant[x])) - a;
00382                         vv = max((float)-3.2, (float)min((float) 3.2, (float) vv));
00383                         //fprintf(stderr, "%.5f %.5f %.5f\n", quant[x], vv, a);
00384                         if(vv>cutoff){
00385                             rData[i][j]= (utype) (vv*100.0) + 320;
00386                         }else{
00387                             rData[i][j] = 0;
00388                         }
00389                     }
00390                 }
00391             }
00392             else{
00393                 for(ii=0; ii<iNumGenes; ii++){
00394                     float a, vv; unsigned char x;
00395                     /* numQueries */
00396                     for(i = geneMap->GetReverse(ii),
00397                         a = GetGeneAverage(i), j=0; j<iNumQueries; j++){
00398                         if((x = r[queryIndex[j]][i])==255) continue;
00399                         vv = quant[x] - a;
00400                         vv = max((float) min((float)vv, (float)3.2), (float)-3.2);
00401                         if(vv>cutoff){
00402                             rData[i][j]= (utype) (vv*100.0) + 320;
00403                         }else{
00404                             rData[i][j] = 0;
00405                         }
00406                     }
00407                 }
00408             }
00409         }
00410 
00411         //return true;
00412     }
00413     /* numGenes */
00414     else if(logit){ //logit on z-scores
00415         for(ii=0; ii<iNumGenes; ii++){
00416             unsigned char x;
00417             for(i = geneMap->GetReverse(ii), j=0; j<iNumQueries; j++){
00418                 if((x = r[queryIndex[j]][i])==255) continue;
00419                 float vv = log(quant[x]) - log((float) 1.0 - quant[x]);
00420                 vv = max((float) min(vv, (float)3.2), (float)-3.2);
00421                 if(vv>cutoff){
00422                     rData[i][j] = (utype) (vv*100.0) + 320;
00423                 }else{
00424                     rData[i][j] = 0;
00425                 }
00426             }
00427         }
00428     }else if(bCorrelation){ //correlation mode
00429         //assumed to be from -1 to +1
00430 
00431         for(ii=0; ii<iNumGenes; ii++){
00432             unsigned char x;
00433             for(i = geneMap->GetReverse(ii), j=0; j<iNumQueries; j++){
00434                 if((x = r[queryIndex[j]][i])==255) continue;
00435                 float vv = quant[x];
00436                 vv = vv * m_fDsetStdev + m_fDsetAverage;
00437                 float e = exp(2.0*vv);
00438                 vv = (e - 1.0) / (e + 1.0);
00439                 if(vv>cutoff){
00440                     vv = vv * 3.0; //scale up by factor of 3 for retaining 
00441                                    //precision, should put value to range 
00442                                    //(-3.0 to 3.0)
00443                     vv = max((float) min(vv, (float)3.2), (float)-3.2);
00444                     rData[i][j] = (utype) (vv*100.0) + 320;
00445                 }else{
00446                     rData[i][j] = (utype) 0;    //default value for 
00447                                                 //not meeting cutoff
00448                 }
00449             }
00450         }
00451 
00452     }else{ //just simple z-scores, no subtraction of avg z scores
00453         for(ii=0; ii<iNumGenes; ii++){
00454             unsigned char x;
00455             for(i = geneMap->GetReverse(ii), j=0; j<iNumQueries; j++){
00456                 if((x = r[queryIndex[j]][i])==255) continue;
00457                 float vv = quant[x];
00458 
00459                 //for functional network=================================
00460                 //vv = vv * 6.0 - 3.0; //transform values from 0-1 to values from -3 to +3
00461 
00462                 vv = max((float) min(vv, (float)3.2), (float)-3.2);
00463                 if(vv>cutoff){
00464                     rData[i][j] = (utype) (vv*100.0) + 320;
00465                 }else{
00466                     rData[i][j] = 0;
00467                 }
00468             }
00469         }
00470     }
00471 
00472     if(bRandom){
00473         vector<vector<utype> > allRandom;
00474         allRandom.resize(iNumQueries);
00475         for(i=0; i<iNumQueries; i++){
00476             allRandom[i] = vector<utype>();
00477         }
00478 
00479         for(ii=0; ii<iNumGenes; ii++){
00480             unsigned char x;
00481             for(i = geneMap->GetReverse(ii), j=0; j<iNumQueries; j++){
00482                 if((x = r[queryIndex[j]][i])==255) continue;
00483                 allRandom[j].push_back(rData[i][j]);
00484             }
00485         }
00486 
00487         int max_size = 0;
00488         for(i=0; i<iNumQueries; i++){
00489             if(allRandom[i].size()>max_size){
00490                 max_size = allRandom[i].size();
00491             }
00492         }
00493 
00494         utype **a = CSeekTools::Init2DArray(iNumQueries, max_size, (utype)0);
00495 
00496         for(i=0; i<iNumQueries; i++){
00497             for(j=0; j<allRandom[i].size(); j++){
00498                 a[i][j] = allRandom[i][j];
00499             }
00500             gsl_ran_shuffle(rand, a[i], allRandom[i].size(), sizeof(utype));
00501         }
00502 
00503         vector<int> k;
00504         CSeekTools::InitVector(k, iNumQueries, (int) 0);
00505 
00506         for(ii=0; ii<iNumGenes; ii++){
00507             unsigned char x;
00508             for(i = geneMap->GetReverse(ii), j=0; j<iNumQueries; j++){
00509                 if((x = r[queryIndex[j]][i])==255) continue;
00510                 //fprintf(stderr, "%d %d\n", rData[i][j], a[j][k[j]]);
00511                 rData[i][j] = a[j][k[j]];
00512                 k[j]++;
00513             }
00514         }
00515         
00516         CSeekTools::Free2DArray(a);
00517     }
00518 
00519 
00520     return true;
00521 }
00522 
00523 
00524 unsigned char** CSeekDataset::GetMatrix(){
00525     return r;
00526 }
00527 
00528 CSeekIntIntMap* CSeekDataset::GetGeneMap(){
00529     return geneMap;
00530 }
00531 
00532 CSeekIntIntMap* CSeekDataset::GetQueryMap(){
00533     return queryMap;
00534 }
00535 
00536 CSeekIntIntMap* CSeekDataset::GetDBMap(){
00537     return dbMap;
00538 }
00539 
00540 float CSeekDataset::GetDatasetAverage() const{
00541     return m_fDsetAverage;
00542 }
00543 
00544 float CSeekDataset::GetDatasetStdev() const{
00545     return m_fDsetStdev;
00546 }
00547 
00548 float CSeekDataset::GetGeneVariance(const utype &i) const{
00549     return geneVariance[i];
00550 }
00551 
00552 float CSeekDataset::GetGeneAverage(const utype &i) const{
00553     return geneAverage[i];
00554 }
00555 
00556 utype CSeekDataset::GetNumGenes() const{
00557     return iNumGenes;
00558 }
00559 
00560 bool CSeekDataset::InitializeCVWeight(const utype &i){
00561     weight.clear();
00562     weight.resize(i);
00563     sum_weight = -1;
00564     return true;
00565 }
00566 
00567 bool CSeekDataset::SetCVWeight(const utype &i, const float &f){
00568     weight[i] = f;
00569     return true;
00570 }
00571 
00572 float CSeekDataset::GetCVWeight(const utype &i){
00573     return weight[i];
00574 }
00575 
00576 const vector<float>& CSeekDataset::GetCVWeight() const{
00577     return weight;
00578 }
00579 
00580 float CSeekDataset::GetDatasetSumWeight(){
00581     utype i;
00582     utype num = 0;
00583     if(sum_weight==-1){
00584         for(sum_weight=0, i=0; i<weight.size(); i++){
00585             if(weight[i]==-1) continue;
00586             sum_weight+=weight[i];
00587             num++;
00588         }
00589         if(num>0) sum_weight/=(float)num;
00590         else sum_weight = -1;
00591     }
00592     return sum_weight;
00593 }
00594 
00595 void CSeekDataset::SetPlatform(CSeekPlatform &cp){
00596     platform = &cp;
00597 }
00598 
00599 CSeekPlatform& CSeekDataset::GetPlatform() const{
00600     return *platform;
00601 }
00602 
00603 }