Sleipnir
src/seekwriter.h
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #ifndef SEEKWRITER_H
00023 #define SEEKWRITER_H
00024 
00025 #include "seekbasic.h"
00026 #include "seekmap.h"
00027 #include "seekevaluate.h"
00028 #include "sparsematrix.h"
00029 #include "datapair.h"
00030 #include "seekreader.h"
00031 #include "strassen.h"
00032 
00033 namespace Sleipnir {
00034 
00035 class CSeekWriter{
00036 public:
00037 
00038     //should be either unsigned short or utype
00039     template<class tType>
00040     static bool ReadSeekSparseMatrixHeader(const char *fileName,
00041     CSeekIntIntMap &m){
00042         FILE *f = fopen(fileName, "rb");
00043         if(f==NULL){
00044             cerr << "File not found" << endl;
00045             return false;
00046         }
00047         size_t j;
00048         tType val, numPresent;
00049         int ret;
00050 
00051         //m need to be initialized to size vecstrGenes.size() first!
00052         ret = fread((char*) (&numPresent), 1, sizeof(numPresent), f);
00053         for(j=0; j<numPresent; j++){
00054             ret = fread((char*)(&val), 1, sizeof(val), f); //val stores the present gene
00055             m.Add((utype) val);
00056         }
00057         fclose(f);
00058         return true;
00059     }
00060 
00061     //compatibility
00062     template<class tType>
00063     static bool ReadSeekSparseMatrix(const char *fileName,
00064     CSparseFlatMatrix<float> &mat, CSeekIntIntMap &m, const int maxRank, 
00065     const float rbp_p, const vector<string> &vecstrGenes){
00066     
00067         FILE *f = fopen(fileName, "rb");
00068         if(f==NULL){
00069             cerr << "File not found" << endl;
00070             return false;
00071         }
00072 
00073         size_t i, j;
00074         tType numGenes, numPresent, val;
00075         int ret;
00076 
00077         mat.Initialize(vecstrGenes.size());
00078         ret = fread((char*) (&numPresent), 1, sizeof(numPresent), f);
00079         for(j=0; j<numPresent; j++){
00080             ret = fread((char*)(&val), 1, sizeof(val), f); //val = gene ID
00081             m.Add((utype) val);
00082             mat.InitializeRow(val, maxRank*2); //initial capacity
00083         }
00084         ret = fread((char*) (&numGenes), 1, sizeof(numGenes), f);
00085 
00086         vector<float> rbp_score;
00087         rbp_score.resize(maxRank);
00088         for(i=0; i<maxRank; i++)
00089             rbp_score[i] = (1.0 - rbp_p) * pow(rbp_p, i);
00090 
00091         for(i=0; i<numGenes; i++){
00092             tType id, id2;  //gene ID
00093             unsigned short numEntries, val; //rank
00094             ret = fread((char*)(&id), 1, sizeof(id), f);
00095             ret = fread((char*)(&numEntries), 1, sizeof(numEntries), f);
00096             for(j=0; j<numEntries; j++){
00097                 ret = fread((char*)(&id2),1,sizeof(id2),f);
00098                 ret = fread((char*)(&val),1,sizeof(val),f);
00099                 tType first = id;
00100                 tType second = id2;
00101                 //mat is a full matrix
00102                 mat.Add(first, second, rbp_score[val]);
00103                 mat.Add(second, first, rbp_score[val]);
00104             }
00105         }
00106         fclose(f);
00107 
00108         mat.Organize();
00109         size_t ii, jj;
00110         const vector<utype> &allRGenes = m.GetAllReverse();
00111         fprintf(stderr, "Begin calculating row sum\n");
00112 
00113         vector<float> vecSum;
00114         CSeekTools::InitVector(vecSum, vecstrGenes.size(), (float) 0);
00115         for(ii=0; ii<m.GetNumSet(); ii++){
00116             i = (size_t) allRGenes[ii];
00117             const vector<CPair<float> > &row = mat.GetRow(i);
00118             for(jj=0; jj<row.size(); jj++)
00119                 vecSum[i] += row[jj].v;
00120         }
00121 
00122         vector<float> vecSqrtSum;
00123         CSeekTools::InitVector(vecSqrtSum, vecstrGenes.size(), (float) 0);
00124 
00125         for(ii=0; ii<m.GetNumSet(); ii++){
00126             i = (size_t) allRGenes[ii];
00127             if(vecSum[i]==0) continue;
00128             vecSqrtSum[i] = sqrtf(vecSum[i]);
00129         }
00130 
00131         fprintf(stderr, "Begin normalization using row sum\n");
00132         float rv;
00133         #pragma omp parallel for \
00134         shared(m, mat, allRGenes, vecSqrtSum) \
00135         private(ii, i, j, rv) schedule(dynamic)
00136         for(ii=0; ii<m.GetNumSet(); ii++){
00137             i = (size_t) allRGenes[ii];
00138             vector<CPair<float> >::iterator row_it;
00139             for(row_it = mat.RowBegin(i); row_it!=mat.RowEnd(i); row_it++){
00140                 j = (size_t) row_it->i;
00141                 rv = row_it->v;
00142                 if(vecSqrtSum[i]==0 || vecSqrtSum[j]==0) continue;
00143                 row_it->v = rv / vecSqrtSum[i] / vecSqrtSum[j];
00144             }
00145         }
00146         return true;
00147     }
00148 
00149     //compatibility
00150     template<class tType>
00151     static bool RemoveDominant(CSparseFlatMatrix<float> &mat, 
00152     CSeekIntIntMap &m, const vector<string> &vecstrGenes){
00153     
00154         size_t i, j;
00155         vector<vector<float> > tmp_mat, tmp2;
00156         tmp_mat.resize(vecstrGenes.size());
00157         tmp2.resize(vecstrGenes.size());
00158         for(i=0; i<vecstrGenes.size(); i++){
00159             tmp_mat[i].resize(vecstrGenes.size());
00160             tmp2[i].resize(vecstrGenes.size());
00161             for(j=0; j<vecstrGenes.size(); j++){
00162                 tmp_mat[i][j] = 0;
00163                 tmp2[i][j] = 0;
00164             }
00165         }
00166 
00167         size_t ii, jj;
00168         const vector<utype> &allRGenes = m.GetAllReverse();
00169         for(ii=0; ii<m.GetNumSet(); ii++){
00170             i = (size_t) allRGenes[ii];
00171             vector<CPair<float> >::iterator row_it;
00172             for(row_it=mat.RowBegin(i); row_it!=mat.RowEnd(i); row_it++){
00173                 j = (size_t) row_it->i;
00174                 tmp_mat[i][j] = row_it->v;
00175             }
00176         }
00177 
00178         int TOP = 100;
00179         fprintf(stderr, "Started removing dominant...\n");
00180         for(ii=0; ii<m.GetNumSet(); ii++){
00181             i = (size_t) allRGenes[ii];
00182             vector<CPair<float> > vp;
00183             vp.resize(m.GetNumSet());
00184             for(jj=0; jj<m.GetNumSet(); jj++){
00185                 j = (size_t) allRGenes[jj];
00186                 vp[jj].i = (utype) j;
00187                 vp[jj].v = tmp_mat[i][j];
00188             }
00189             nth_element(vp.begin(), vp.begin()+TOP, vp.end(), CDescendingValue<float>());
00190             sort(vp.begin(), vp.begin()+TOP, CDescendingValue<float>());
00191 
00192             //top 100
00193             size_t k, l;
00194             size_t max_ind = 0;
00195             float max_val = -1;
00196             for(k=0; k<TOP; k++){
00197                 size_t this_g = vp[k].i;
00198                 float v = 0;
00199                 for(l=0; l<TOP; l++){
00200                     size_t other_g = vp[l].i;
00201                     if(this_g==other_g) continue;
00202                     v+=tmp_mat[this_g][other_g];
00203                 }
00204                 if(v>max_val){
00205                     max_val = v;
00206                     max_ind = k;
00207                 }
00208             }
00209             for(jj=0; jj<m.GetNumSet(); jj++){
00210                 j = (size_t) allRGenes[jj];
00211                 tmp2[i][j] = tmp_mat[i][j] - tmp_mat[j][max_ind];
00212                 if(tmp2[i][j]<0)    
00213                     tmp2[i][j] = 0;
00214             }
00215         }
00216 
00217         fprintf(stderr, "Started re-normalizing matrix...\n");
00218 
00219         for(ii=0; ii<m.GetNumSet(); ii++){
00220             i = (size_t) allRGenes[ii];
00221             for(jj=ii+1; jj<m.GetNumSet(); jj++){
00222                 j = (size_t) allRGenes[jj];
00223                 float m = max(tmp2[i][j], tmp2[j][i]);
00224                 tmp2[i][j] = m;
00225                 tmp2[j][i] = m;
00226             }
00227         }
00228 
00229         vector<float> vecSum;
00230         CSeekTools::InitVector(vecSum, vecstrGenes.size(), (float) 0);
00231         for(ii=0; ii<m.GetNumSet(); ii++){
00232             i = (size_t) allRGenes[ii];
00233             for(jj=0; jj<m.GetNumSet(); jj++){
00234                 j = (size_t) allRGenes[jj];
00235                 vecSum[i] += tmp2[i][j];
00236             }
00237         }
00238 
00239         vector<float> vecSqrtSum;
00240         CSeekTools::InitVector(vecSqrtSum, vecstrGenes.size(), (float) 0);
00241         for(ii=0; ii<m.GetNumSet(); ii++){
00242             i = (size_t) allRGenes[ii];
00243             if(vecSum[i]==0) continue;
00244             vecSqrtSum[i] = sqrtf(vecSum[i]);
00245         }
00246 
00247         for(ii=0; ii<m.GetNumSet(); ii++){
00248             i = (size_t) allRGenes[ii];
00249             vector<CPair<float> >::iterator row_it;
00250             for(row_it=mat.RowBegin(i); row_it!=mat.RowEnd(i); row_it++){
00251                 j = (size_t) row_it->i;
00252                 if(vecSqrtSum[i]==0 || vecSqrtSum[j]==0) continue;
00253                 row_it->v = tmp2[i][j] / vecSqrtSum[i] / vecSqrtSum[j];
00254                 //fprintf(stderr, "%d %d %.3e\n", i, j, row_it->v);
00255             }
00256         }
00257         return true;
00258     }
00259 
00260     //compatibility
00261     template<class tType>
00262     static bool WriteSparseMatrix(CDataPair &Dat, 
00263     vector<map<tType,unsigned short> > &umat, 
00264     const vector<string> &vecstrGenes, const char *fileName){
00265 
00266         FILE *f = fopen(fileName, "wb");
00267         if(f==NULL){
00268             fprintf(stderr, "File not found %s\n", fileName);
00269             return false;
00270         }
00271 
00272         size_t i, j;
00273         vector<tType> veciGenes;
00274         veciGenes.resize(vecstrGenes.size());
00275         for(i=0; i<vecstrGenes.size(); i++)
00276             veciGenes[i] = (tType) Dat.GetGeneIndex(vecstrGenes[i]);
00277 
00278         CSeekIntIntMap mm(vecstrGenes.size());
00279         for(i=0; i<vecstrGenes.size(); i++)
00280             if(veciGenes[i]!=(tType)-1)
00281                 mm.Add((utype)i);
00282 
00283         tType numPresent = (tType) mm.GetNumSet();  
00284         //1 tType
00285         fwrite((char*)(&numPresent), 1, sizeof(numPresent), f);
00286         const vector<utype> &allR = mm.GetAllReverse();
00287         //numPresent tType
00288         for(i=0; i<numPresent; i++){
00289             tType pr = (tType) allR[i];
00290             fwrite((char*)(&pr), 1, sizeof(pr), f);
00291         }
00292 
00293         tType numGenes = 0;
00294         for(i=0; i<vecstrGenes.size(); i++){
00295             if(umat[i].size()==0) continue;
00296             numGenes++;
00297         }
00298         //1 tType
00299         fwrite((char*) (&numGenes), 1, sizeof(numGenes), f);
00300 
00301         for(i=0; i<vecstrGenes.size(); i++){
00302             unsigned short numEntries = umat[i].size(); //should be 1000
00303             if(numEntries==0) 
00304                 continue;
00305             //1 tType
00306             tType gi = (tType) i;
00307             fwrite((char*) (&gi), 1, sizeof(gi), f);
00308             //1 unsigned short
00309             fwrite((char*) (&numEntries), 1, sizeof(numEntries), f);
00310             typename map<tType,unsigned short>::iterator it;
00311             for(it=umat[i].begin(); it!=umat[i].end(); it++){
00312                 tType first = it->first;
00313                 unsigned short second = it->second;
00314                 //1 tType
00315                 fwrite((char*) (&first), 1, sizeof(first), f);
00316                 //1 unsigned short
00317                 fwrite((char*) (&second), 1, sizeof(second), f);
00318             }
00319         }
00320         fclose(f);
00321         return true;
00322     }
00323 
00324     //compatiblity
00325     template<class tType>
00326     static bool GetSparseRankMatrix(CDataPair &Dat, 
00327     vector<map<tType,unsigned short> > &umat, int maxRank, 
00328     const vector<string> &vecstrGenes){
00329     
00330         size_t i, j;
00331         vector<tType> veciGenes;
00332         veciGenes.resize(vecstrGenes.size());
00333         for( i = 0; i < vecstrGenes.size( ); ++i )
00334             veciGenes[ i ] = (tType) Dat.GetGeneIndex( vecstrGenes[i] );
00335         umat.resize(vecstrGenes.size());
00336         for(i=0; i<vecstrGenes.size(); i++)
00337             umat[i] = map<tType, unsigned short>();
00338 
00339         fprintf(stderr, "Start reading DAB...\n");
00340         tType s, t;
00341         for(i=0; i<vecstrGenes.size(); i++){
00342             if((s=veciGenes[i])==(tType)-1) continue;
00343             if(i%1000==0) fprintf(stderr, "Start reading gene %d...\n", i);
00344 
00345             float *v = Dat.GetFullRow(s);
00346             vector<AResultFloat> vv;
00347             vv.resize(vecstrGenes.size());
00348 
00349             for(j=0; j<vecstrGenes.size(); j++){
00350                 vv[j].i = (utype) j;
00351                 vv[j].f = -9999;
00352                 if((t=veciGenes[j])==(tType)-1) continue;
00353                 if(CMeta::IsNaN(v[t])) continue;
00354                 vv[j].f = v[t];
00355             }
00356             nth_element(vv.begin(), vv.begin()+maxRank, vv.end());
00357             sort(vv.begin(), vv.begin()+maxRank);
00358 
00359             for(j=0; j<vecstrGenes.size(); j++){
00360                 if(j<maxRank){
00361                     tType first = (tType) i;
00362                     tType second = (tType) vv[j].i;
00363                     if((tType) i >= (tType) vv[j].i){
00364                         first = (tType) vv[j].i;
00365                         second = (tType) i;
00366                     }
00367                     typename map<tType,unsigned short>::iterator it;
00368                     if((it=umat[first].find(second))==umat[first].end())
00369                         umat[first][second] = (unsigned short) j;
00370                     else
00371                         umat[first][second] = std::min(it->second, (unsigned short) j);
00372                 }
00373             }
00374             delete[] v;
00375         }
00376         fprintf(stderr, "Finished reading DAB\n");
00377         return true;
00378     }
00379 
00380     //To be used after NormalizeDAB
00381     template<class tType>
00382     static bool ConvertToSparseMatrix(CDataPair &Dat,
00383     vector<map<tType,unsigned short> > &umat,
00384     const vector<string> &vecstrGenes, const float cutoff_val){
00385 
00386         size_t i, j;
00387         vector<tType> veciGenes;
00388         veciGenes.resize(vecstrGenes.size());
00389         for( i = 0; i < vecstrGenes.size( ); ++i )
00390             veciGenes[ i ] = (tType) Dat.GetGeneIndex( vecstrGenes[i] );
00391         umat.resize(vecstrGenes.size());
00392         for(i=0; i<vecstrGenes.size(); i++)
00393             umat[i] = map<tType, unsigned short>();
00394 
00395         tType s,t;
00396         for(i=0; i<vecstrGenes.size(); i++){
00397             if((s=veciGenes[i])==(tType)-1) continue;
00398             if(i%1000==0) fprintf(stderr, "Start reading gene %d...\n", i);
00399 
00400             for(j=i+1; j<vecstrGenes.size(); j++){
00401                 if((t=veciGenes[j])==(tType)-1) continue;
00402                 float r = Dat.Get(s,t);
00403                 if(CMeta::IsNaN(r)) continue;
00404                 if(r > cutoff_val)
00405                     umat[i][j] = (unsigned short) (r * 100.0);
00406             }
00407         }
00408         fprintf(stderr, "Finished reading DAB\n");
00409         return true;
00410     }
00411 
00412     //to be used for sparse matrix created from cutting-off z-scores
00413     template<class tType>
00414     static bool ReadSeekSparseMatrix(const char *fileName,
00415     CSparseFlatMatrix<float> &mat, CSeekIntIntMap &m, 
00416     const vector<string> &vecstrGenes, const int initialCapacity, 
00417     const float exponent){
00418     
00419         if(exponent<1.0){
00420             fprintf(stderr, "Exponent must be >=1.0\n");
00421             return false;
00422         }
00423     
00424         FILE *f = fopen(fileName, "rb");
00425         if(f==NULL){
00426             cerr << "File not found" << endl;
00427             return false;
00428         }
00429 
00430         size_t i, j;
00431         tType numGenes, numPresent, val;
00432         int ret;
00433 
00434         mat.Initialize(vecstrGenes.size());
00435         ret = fread((char*) (&numPresent), 1, sizeof(numPresent), f);
00436         for(j=0; j<numPresent; j++){
00437             ret = fread((char*)(&val), 1, sizeof(val), f); //val = gene ID
00438             m.Add((utype) val);
00439             mat.InitializeRow(val, initialCapacity); //initial capacity
00440         }
00441         ret = fread((char*) (&numGenes), 1, sizeof(numGenes), f);
00442 
00443         for(i=0; i<numGenes; i++){
00444             tType id, id2;  //gene ID
00445             unsigned short numEntries, val; //z-scores * 100.0
00446             ret = fread((char*)(&id), 1, sizeof(id), f);
00447             ret = fread((char*)(&numEntries), 1, sizeof(numEntries), f);
00448             for(j=0; j<numEntries; j++){
00449                 ret = fread((char*)(&id2),1,sizeof(id2),f);
00450                 ret = fread((char*)(&val),1,sizeof(val),f);
00451                 tType first = id;
00452                 tType second = id2;
00453                 float fval = (float) val / 100.0;
00454                 if(exponent>1.0)
00455                     fval = pow(fval, exponent);
00456                 mat.Add(first, second, fval);
00457                 mat.Add(second, first, fval);
00458             }
00459         }
00460         fclose(f);
00461 
00462         mat.Organize();
00463         return true;
00464     }
00465 
00466     //===============================================================
00467     //not currently used
00468     static bool ReadSparseMatrix(const char *fileName, 
00469         vector<map<utype,float> > &mat, 
00470         CSeekIntIntMap &m, const int maxRank, const float rbp_p,
00471         const vector<string> &vecstrGenes);
00472 
00473     //not currently used
00474     static bool ProductNorm(const vector<map<utype,float> > &mat1,
00475         const vector<map<utype,float> > &mat2, const CSeekIntIntMap &m1, 
00476         const CSeekIntIntMap &m2, vector<map<utype,float> > &re);
00477 
00478     //================================================================
00479     static bool SumSparseMatrix(CSparseFlatMatrix<float> &mat1,
00480         CSparseFlatHalfMatrix<float> &res, const CSeekIntIntMap &mi, const float w);
00481 
00482     static bool SumSparseMatrix(CSparseFlatMatrix<float> &mat1,
00483         CHalfMatrix<float> &res, const CSeekIntIntMap &mi, const float w);
00484 
00485     static bool NormalizeDAB(CDataPair &Dat, const vector<string> &vecstrGenes,
00486         //bool cutoff, float cutoff_val, 
00487         bool expTransform, bool divideNorm, bool subtractNorm);
00488 
00489     static bool GetGeneAverage(CDataPair &Dat,
00490         const vector<string> &vecstrGenes,
00491         vector<float> &vecResult, bool logit=false, float top_percent=1.0);
00492     static bool GetGenePresence(CDataPair &Dat,
00493         const vector<string> &vecstrGenes,
00494         vector<char> &vecResult);
00495     static bool GetDatasetSinfo(CDataPair &Dat, float &mean,
00496         float &stdev);
00497 
00498     static bool TopologicalOverlap(CDataPair &Dat,
00499         const vector<string> &vecstrGenes);
00500 };
00501 
00502 }
00503 #endif