Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #ifndef SEEKWRITER_H 00023 #define SEEKWRITER_H 00024 00025 #include "seekbasic.h" 00026 #include "seekmap.h" 00027 #include "seekevaluate.h" 00028 #include "sparsematrix.h" 00029 #include "datapair.h" 00030 #include "seekreader.h" 00031 #include "strassen.h" 00032 00033 namespace Sleipnir { 00034 00035 class CSeekWriter{ 00036 public: 00037 00038 //should be either unsigned short or utype 00039 template<class tType> 00040 static bool ReadSeekSparseMatrixHeader(const char *fileName, 00041 CSeekIntIntMap &m){ 00042 FILE *f = fopen(fileName, "rb"); 00043 if(f==NULL){ 00044 cerr << "File not found" << endl; 00045 return false; 00046 } 00047 size_t j; 00048 tType val, numPresent; 00049 int ret; 00050 00051 //m need to be initialized to size vecstrGenes.size() first! 00052 ret = fread((char*) (&numPresent), 1, sizeof(numPresent), f); 00053 for(j=0; j<numPresent; j++){ 00054 ret = fread((char*)(&val), 1, sizeof(val), f); //val stores the present gene 00055 m.Add((utype) val); 00056 } 00057 fclose(f); 00058 return true; 00059 } 00060 00061 //compatibility 00062 template<class tType> 00063 static bool ReadSeekSparseMatrix(const char *fileName, 00064 CSparseFlatMatrix<float> &mat, CSeekIntIntMap &m, const int maxRank, 00065 const float rbp_p, const vector<string> &vecstrGenes){ 00066 00067 FILE *f = fopen(fileName, "rb"); 00068 if(f==NULL){ 00069 cerr << "File not found" << endl; 00070 return false; 00071 } 00072 00073 size_t i, j; 00074 tType numGenes, numPresent, val; 00075 int ret; 00076 00077 mat.Initialize(vecstrGenes.size()); 00078 ret = fread((char*) (&numPresent), 1, sizeof(numPresent), f); 00079 for(j=0; j<numPresent; j++){ 00080 ret = fread((char*)(&val), 1, sizeof(val), f); //val = gene ID 00081 m.Add((utype) val); 00082 mat.InitializeRow(val, maxRank*2); //initial capacity 00083 } 00084 ret = fread((char*) (&numGenes), 1, sizeof(numGenes), f); 00085 00086 vector<float> rbp_score; 00087 rbp_score.resize(maxRank); 00088 for(i=0; i<maxRank; i++) 00089 rbp_score[i] = (1.0 - rbp_p) * pow(rbp_p, i); 00090 00091 for(i=0; i<numGenes; i++){ 00092 tType id, id2; //gene ID 00093 unsigned short numEntries, val; //rank 00094 ret = fread((char*)(&id), 1, sizeof(id), f); 00095 ret = fread((char*)(&numEntries), 1, sizeof(numEntries), f); 00096 for(j=0; j<numEntries; j++){ 00097 ret = fread((char*)(&id2),1,sizeof(id2),f); 00098 ret = fread((char*)(&val),1,sizeof(val),f); 00099 tType first = id; 00100 tType second = id2; 00101 //mat is a full matrix 00102 mat.Add(first, second, rbp_score[val]); 00103 mat.Add(second, first, rbp_score[val]); 00104 } 00105 } 00106 fclose(f); 00107 00108 mat.Organize(); 00109 size_t ii, jj; 00110 const vector<utype> &allRGenes = m.GetAllReverse(); 00111 fprintf(stderr, "Begin calculating row sum\n"); 00112 00113 vector<float> vecSum; 00114 CSeekTools::InitVector(vecSum, vecstrGenes.size(), (float) 0); 00115 for(ii=0; ii<m.GetNumSet(); ii++){ 00116 i = (size_t) allRGenes[ii]; 00117 const vector<CPair<float> > &row = mat.GetRow(i); 00118 for(jj=0; jj<row.size(); jj++) 00119 vecSum[i] += row[jj].v; 00120 } 00121 00122 vector<float> vecSqrtSum; 00123 CSeekTools::InitVector(vecSqrtSum, vecstrGenes.size(), (float) 0); 00124 00125 for(ii=0; ii<m.GetNumSet(); ii++){ 00126 i = (size_t) allRGenes[ii]; 00127 if(vecSum[i]==0) continue; 00128 vecSqrtSum[i] = sqrtf(vecSum[i]); 00129 } 00130 00131 fprintf(stderr, "Begin normalization using row sum\n"); 00132 float rv; 00133 #pragma omp parallel for \ 00134 shared(m, mat, allRGenes, vecSqrtSum) \ 00135 private(ii, i, j, rv) schedule(dynamic) 00136 for(ii=0; ii<m.GetNumSet(); ii++){ 00137 i = (size_t) allRGenes[ii]; 00138 vector<CPair<float> >::iterator row_it; 00139 for(row_it = mat.RowBegin(i); row_it!=mat.RowEnd(i); row_it++){ 00140 j = (size_t) row_it->i; 00141 rv = row_it->v; 00142 if(vecSqrtSum[i]==0 || vecSqrtSum[j]==0) continue; 00143 row_it->v = rv / vecSqrtSum[i] / vecSqrtSum[j]; 00144 } 00145 } 00146 return true; 00147 } 00148 00149 //compatibility 00150 template<class tType> 00151 static bool RemoveDominant(CSparseFlatMatrix<float> &mat, 00152 CSeekIntIntMap &m, const vector<string> &vecstrGenes){ 00153 00154 size_t i, j; 00155 vector<vector<float> > tmp_mat, tmp2; 00156 tmp_mat.resize(vecstrGenes.size()); 00157 tmp2.resize(vecstrGenes.size()); 00158 for(i=0; i<vecstrGenes.size(); i++){ 00159 tmp_mat[i].resize(vecstrGenes.size()); 00160 tmp2[i].resize(vecstrGenes.size()); 00161 for(j=0; j<vecstrGenes.size(); j++){ 00162 tmp_mat[i][j] = 0; 00163 tmp2[i][j] = 0; 00164 } 00165 } 00166 00167 size_t ii, jj; 00168 const vector<utype> &allRGenes = m.GetAllReverse(); 00169 for(ii=0; ii<m.GetNumSet(); ii++){ 00170 i = (size_t) allRGenes[ii]; 00171 vector<CPair<float> >::iterator row_it; 00172 for(row_it=mat.RowBegin(i); row_it!=mat.RowEnd(i); row_it++){ 00173 j = (size_t) row_it->i; 00174 tmp_mat[i][j] = row_it->v; 00175 } 00176 } 00177 00178 int TOP = 100; 00179 fprintf(stderr, "Started removing dominant...\n"); 00180 for(ii=0; ii<m.GetNumSet(); ii++){ 00181 i = (size_t) allRGenes[ii]; 00182 vector<CPair<float> > vp; 00183 vp.resize(m.GetNumSet()); 00184 for(jj=0; jj<m.GetNumSet(); jj++){ 00185 j = (size_t) allRGenes[jj]; 00186 vp[jj].i = (utype) j; 00187 vp[jj].v = tmp_mat[i][j]; 00188 } 00189 nth_element(vp.begin(), vp.begin()+TOP, vp.end(), CDescendingValue<float>()); 00190 sort(vp.begin(), vp.begin()+TOP, CDescendingValue<float>()); 00191 00192 //top 100 00193 size_t k, l; 00194 size_t max_ind = 0; 00195 float max_val = -1; 00196 for(k=0; k<TOP; k++){ 00197 size_t this_g = vp[k].i; 00198 float v = 0; 00199 for(l=0; l<TOP; l++){ 00200 size_t other_g = vp[l].i; 00201 if(this_g==other_g) continue; 00202 v+=tmp_mat[this_g][other_g]; 00203 } 00204 if(v>max_val){ 00205 max_val = v; 00206 max_ind = k; 00207 } 00208 } 00209 for(jj=0; jj<m.GetNumSet(); jj++){ 00210 j = (size_t) allRGenes[jj]; 00211 tmp2[i][j] = tmp_mat[i][j] - tmp_mat[j][max_ind]; 00212 if(tmp2[i][j]<0) 00213 tmp2[i][j] = 0; 00214 } 00215 } 00216 00217 fprintf(stderr, "Started re-normalizing matrix...\n"); 00218 00219 for(ii=0; ii<m.GetNumSet(); ii++){ 00220 i = (size_t) allRGenes[ii]; 00221 for(jj=ii+1; jj<m.GetNumSet(); jj++){ 00222 j = (size_t) allRGenes[jj]; 00223 float m = max(tmp2[i][j], tmp2[j][i]); 00224 tmp2[i][j] = m; 00225 tmp2[j][i] = m; 00226 } 00227 } 00228 00229 vector<float> vecSum; 00230 CSeekTools::InitVector(vecSum, vecstrGenes.size(), (float) 0); 00231 for(ii=0; ii<m.GetNumSet(); ii++){ 00232 i = (size_t) allRGenes[ii]; 00233 for(jj=0; jj<m.GetNumSet(); jj++){ 00234 j = (size_t) allRGenes[jj]; 00235 vecSum[i] += tmp2[i][j]; 00236 } 00237 } 00238 00239 vector<float> vecSqrtSum; 00240 CSeekTools::InitVector(vecSqrtSum, vecstrGenes.size(), (float) 0); 00241 for(ii=0; ii<m.GetNumSet(); ii++){ 00242 i = (size_t) allRGenes[ii]; 00243 if(vecSum[i]==0) continue; 00244 vecSqrtSum[i] = sqrtf(vecSum[i]); 00245 } 00246 00247 for(ii=0; ii<m.GetNumSet(); ii++){ 00248 i = (size_t) allRGenes[ii]; 00249 vector<CPair<float> >::iterator row_it; 00250 for(row_it=mat.RowBegin(i); row_it!=mat.RowEnd(i); row_it++){ 00251 j = (size_t) row_it->i; 00252 if(vecSqrtSum[i]==0 || vecSqrtSum[j]==0) continue; 00253 row_it->v = tmp2[i][j] / vecSqrtSum[i] / vecSqrtSum[j]; 00254 //fprintf(stderr, "%d %d %.3e\n", i, j, row_it->v); 00255 } 00256 } 00257 return true; 00258 } 00259 00260 //compatibility 00261 template<class tType> 00262 static bool WriteSparseMatrix(CDataPair &Dat, 00263 vector<map<tType,unsigned short> > &umat, 00264 const vector<string> &vecstrGenes, const char *fileName){ 00265 00266 FILE *f = fopen(fileName, "wb"); 00267 if(f==NULL){ 00268 fprintf(stderr, "File not found %s\n", fileName); 00269 return false; 00270 } 00271 00272 size_t i, j; 00273 vector<tType> veciGenes; 00274 veciGenes.resize(vecstrGenes.size()); 00275 for(i=0; i<vecstrGenes.size(); i++) 00276 veciGenes[i] = (tType) Dat.GetGeneIndex(vecstrGenes[i]); 00277 00278 CSeekIntIntMap mm(vecstrGenes.size()); 00279 for(i=0; i<vecstrGenes.size(); i++) 00280 if(veciGenes[i]!=(tType)-1) 00281 mm.Add((utype)i); 00282 00283 tType numPresent = (tType) mm.GetNumSet(); 00284 //1 tType 00285 fwrite((char*)(&numPresent), 1, sizeof(numPresent), f); 00286 const vector<utype> &allR = mm.GetAllReverse(); 00287 //numPresent tType 00288 for(i=0; i<numPresent; i++){ 00289 tType pr = (tType) allR[i]; 00290 fwrite((char*)(&pr), 1, sizeof(pr), f); 00291 } 00292 00293 tType numGenes = 0; 00294 for(i=0; i<vecstrGenes.size(); i++){ 00295 if(umat[i].size()==0) continue; 00296 numGenes++; 00297 } 00298 //1 tType 00299 fwrite((char*) (&numGenes), 1, sizeof(numGenes), f); 00300 00301 for(i=0; i<vecstrGenes.size(); i++){ 00302 unsigned short numEntries = umat[i].size(); //should be 1000 00303 if(numEntries==0) 00304 continue; 00305 //1 tType 00306 tType gi = (tType) i; 00307 fwrite((char*) (&gi), 1, sizeof(gi), f); 00308 //1 unsigned short 00309 fwrite((char*) (&numEntries), 1, sizeof(numEntries), f); 00310 typename map<tType,unsigned short>::iterator it; 00311 for(it=umat[i].begin(); it!=umat[i].end(); it++){ 00312 tType first = it->first; 00313 unsigned short second = it->second; 00314 //1 tType 00315 fwrite((char*) (&first), 1, sizeof(first), f); 00316 //1 unsigned short 00317 fwrite((char*) (&second), 1, sizeof(second), f); 00318 } 00319 } 00320 fclose(f); 00321 return true; 00322 } 00323 00324 //compatiblity 00325 template<class tType> 00326 static bool GetSparseRankMatrix(CDataPair &Dat, 00327 vector<map<tType,unsigned short> > &umat, int maxRank, 00328 const vector<string> &vecstrGenes){ 00329 00330 size_t i, j; 00331 vector<tType> veciGenes; 00332 veciGenes.resize(vecstrGenes.size()); 00333 for( i = 0; i < vecstrGenes.size( ); ++i ) 00334 veciGenes[ i ] = (tType) Dat.GetGeneIndex( vecstrGenes[i] ); 00335 umat.resize(vecstrGenes.size()); 00336 for(i=0; i<vecstrGenes.size(); i++) 00337 umat[i] = map<tType, unsigned short>(); 00338 00339 fprintf(stderr, "Start reading DAB...\n"); 00340 tType s, t; 00341 for(i=0; i<vecstrGenes.size(); i++){ 00342 if((s=veciGenes[i])==(tType)-1) continue; 00343 if(i%1000==0) fprintf(stderr, "Start reading gene %d...\n", i); 00344 00345 float *v = Dat.GetFullRow(s); 00346 vector<AResultFloat> vv; 00347 vv.resize(vecstrGenes.size()); 00348 00349 for(j=0; j<vecstrGenes.size(); j++){ 00350 vv[j].i = (utype) j; 00351 vv[j].f = -9999; 00352 if((t=veciGenes[j])==(tType)-1) continue; 00353 if(CMeta::IsNaN(v[t])) continue; 00354 vv[j].f = v[t]; 00355 } 00356 nth_element(vv.begin(), vv.begin()+maxRank, vv.end()); 00357 sort(vv.begin(), vv.begin()+maxRank); 00358 00359 for(j=0; j<vecstrGenes.size(); j++){ 00360 if(j<maxRank){ 00361 tType first = (tType) i; 00362 tType second = (tType) vv[j].i; 00363 if((tType) i >= (tType) vv[j].i){ 00364 first = (tType) vv[j].i; 00365 second = (tType) i; 00366 } 00367 typename map<tType,unsigned short>::iterator it; 00368 if((it=umat[first].find(second))==umat[first].end()) 00369 umat[first][second] = (unsigned short) j; 00370 else 00371 umat[first][second] = std::min(it->second, (unsigned short) j); 00372 } 00373 } 00374 delete[] v; 00375 } 00376 fprintf(stderr, "Finished reading DAB\n"); 00377 return true; 00378 } 00379 00380 //To be used after NormalizeDAB 00381 template<class tType> 00382 static bool ConvertToSparseMatrix(CDataPair &Dat, 00383 vector<map<tType,unsigned short> > &umat, 00384 const vector<string> &vecstrGenes, const float cutoff_val){ 00385 00386 size_t i, j; 00387 vector<tType> veciGenes; 00388 veciGenes.resize(vecstrGenes.size()); 00389 for( i = 0; i < vecstrGenes.size( ); ++i ) 00390 veciGenes[ i ] = (tType) Dat.GetGeneIndex( vecstrGenes[i] ); 00391 umat.resize(vecstrGenes.size()); 00392 for(i=0; i<vecstrGenes.size(); i++) 00393 umat[i] = map<tType, unsigned short>(); 00394 00395 tType s,t; 00396 for(i=0; i<vecstrGenes.size(); i++){ 00397 if((s=veciGenes[i])==(tType)-1) continue; 00398 if(i%1000==0) fprintf(stderr, "Start reading gene %d...\n", i); 00399 00400 for(j=i+1; j<vecstrGenes.size(); j++){ 00401 if((t=veciGenes[j])==(tType)-1) continue; 00402 float r = Dat.Get(s,t); 00403 if(CMeta::IsNaN(r)) continue; 00404 if(r > cutoff_val) 00405 umat[i][j] = (unsigned short) (r * 100.0); 00406 } 00407 } 00408 fprintf(stderr, "Finished reading DAB\n"); 00409 return true; 00410 } 00411 00412 //to be used for sparse matrix created from cutting-off z-scores 00413 template<class tType> 00414 static bool ReadSeekSparseMatrix(const char *fileName, 00415 CSparseFlatMatrix<float> &mat, CSeekIntIntMap &m, 00416 const vector<string> &vecstrGenes, const int initialCapacity, 00417 const float exponent){ 00418 00419 if(exponent<1.0){ 00420 fprintf(stderr, "Exponent must be >=1.0\n"); 00421 return false; 00422 } 00423 00424 FILE *f = fopen(fileName, "rb"); 00425 if(f==NULL){ 00426 cerr << "File not found" << endl; 00427 return false; 00428 } 00429 00430 size_t i, j; 00431 tType numGenes, numPresent, val; 00432 int ret; 00433 00434 mat.Initialize(vecstrGenes.size()); 00435 ret = fread((char*) (&numPresent), 1, sizeof(numPresent), f); 00436 for(j=0; j<numPresent; j++){ 00437 ret = fread((char*)(&val), 1, sizeof(val), f); //val = gene ID 00438 m.Add((utype) val); 00439 mat.InitializeRow(val, initialCapacity); //initial capacity 00440 } 00441 ret = fread((char*) (&numGenes), 1, sizeof(numGenes), f); 00442 00443 for(i=0; i<numGenes; i++){ 00444 tType id, id2; //gene ID 00445 unsigned short numEntries, val; //z-scores * 100.0 00446 ret = fread((char*)(&id), 1, sizeof(id), f); 00447 ret = fread((char*)(&numEntries), 1, sizeof(numEntries), f); 00448 for(j=0; j<numEntries; j++){ 00449 ret = fread((char*)(&id2),1,sizeof(id2),f); 00450 ret = fread((char*)(&val),1,sizeof(val),f); 00451 tType first = id; 00452 tType second = id2; 00453 float fval = (float) val / 100.0; 00454 if(exponent>1.0) 00455 fval = pow(fval, exponent); 00456 mat.Add(first, second, fval); 00457 mat.Add(second, first, fval); 00458 } 00459 } 00460 fclose(f); 00461 00462 mat.Organize(); 00463 return true; 00464 } 00465 00466 //=============================================================== 00467 //not currently used 00468 static bool ReadSparseMatrix(const char *fileName, 00469 vector<map<utype,float> > &mat, 00470 CSeekIntIntMap &m, const int maxRank, const float rbp_p, 00471 const vector<string> &vecstrGenes); 00472 00473 //not currently used 00474 static bool ProductNorm(const vector<map<utype,float> > &mat1, 00475 const vector<map<utype,float> > &mat2, const CSeekIntIntMap &m1, 00476 const CSeekIntIntMap &m2, vector<map<utype,float> > &re); 00477 00478 //================================================================ 00479 static bool SumSparseMatrix(CSparseFlatMatrix<float> &mat1, 00480 CSparseFlatHalfMatrix<float> &res, const CSeekIntIntMap &mi, const float w); 00481 00482 static bool SumSparseMatrix(CSparseFlatMatrix<float> &mat1, 00483 CHalfMatrix<float> &res, const CSeekIntIntMap &mi, const float w); 00484 00485 static bool NormalizeDAB(CDataPair &Dat, const vector<string> &vecstrGenes, 00486 //bool cutoff, float cutoff_val, 00487 bool expTransform, bool divideNorm, bool subtractNorm); 00488 00489 static bool GetGeneAverage(CDataPair &Dat, 00490 const vector<string> &vecstrGenes, 00491 vector<float> &vecResult, bool logit=false, float top_percent=1.0); 00492 static bool GetGenePresence(CDataPair &Dat, 00493 const vector<string> &vecstrGenes, 00494 vector<char> &vecResult); 00495 static bool GetDatasetSinfo(CDataPair &Dat, float &mean, 00496 float &stdev); 00497 00498 static bool TopologicalOverlap(CDataPair &Dat, 00499 const vector<string> &vecstrGenes); 00500 }; 00501 00502 } 00503 #endif