Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "pcl.h" 00024 #include "meta.h" 00025 #include "statistics.h" 00026 #include "genome.h" 00027 #include "measure.h" 00028 #include "dat.h" 00029 00030 namespace Sleipnir { 00031 00032 const char CPCLImpl::c_szEWEIGHT[] = "EWEIGHT"; 00033 const char CPCLImpl::c_szGENE[] = "GENE"; 00034 const char CPCLImpl::c_szGID[] = "GID"; 00035 const char CPCLImpl::c_szGWEIGHT[] = "GWEIGHT"; 00036 const char CPCLImpl::c_szNAME[] = "NAME"; 00037 const char CPCLImpl::c_szOne[] = "1"; 00038 const char CPCLImpl::c_szExtension[] = ".pcl"; 00039 const char CPCLImpl::c_szBinExtension[] = ".bin"; 00040 const char CPCLImpl::c_szDabExtension[] = ".dab"; 00041 00042 struct SNeighbors { 00043 CFullMatrix<pair<size_t, float> > m_MatNeighbors; 00044 vector<size_t> m_veciMin; 00045 vector<size_t> m_veciColumns; 00046 00047 void Initialize(const float* adValues, size_t iValues, size_t iNeighbors) { 00048 size_t i, j; 00049 00050 m_veciColumns.clear(); 00051 for (i = 0; i < iValues; ++i) 00052 if (CMeta::IsNaN(adValues[i])) 00053 m_veciColumns.push_back(i); 00054 00055 m_veciMin.resize(m_veciColumns.size()); 00056 m_MatNeighbors.Initialize(iNeighbors, m_veciColumns.size()); 00057 for (i = 0; i < m_MatNeighbors.GetRows(); ++i) 00058 for (j = 0; j < m_MatNeighbors.GetColumns(); ++j) 00059 m_MatNeighbors.Get(i, j).second = -FLT_MAX; 00060 } 00061 00062 bool Add(size_t iNeighbor, float dSim, const float* adValues) { 00063 size_t i, j, iCol; 00064 bool fRet; 00065 00066 if (!m_MatNeighbors.GetRows()) 00067 return false; 00068 for (fRet = false, i = 0; i < m_veciColumns.size(); ++i) { 00069 iCol = m_veciColumns[i]; 00070 if (!CMeta::IsNaN(adValues[iCol]) && (dSim > m_MatNeighbors.Get( 00071 m_veciMin[i], i).second)) { 00072 fRet = true; 00073 m_MatNeighbors.Get(m_veciMin[i], i).first = iNeighbor; 00074 m_MatNeighbors.Get(m_veciMin[i], i).second = dSim; 00075 00076 for (m_veciMin[i] = 0, j = 1; j < m_MatNeighbors.GetRows(); ++j) 00077 if (m_MatNeighbors.Get(j, i).second < m_MatNeighbors.Get( 00078 m_veciMin[i], i).second) 00079 m_veciMin[i] = j; 00080 } 00081 } 00082 00083 return fRet; 00084 } 00085 00086 size_t GetColumn(size_t iColumn) const { 00087 size_t i; 00088 00089 for (i = 0; i < m_veciColumns.size(); ++i) 00090 if (m_veciColumns[i] == iColumn) 00091 return i; 00092 00093 return -1; 00094 } 00095 }; 00096 00165 int CPCL::Distance(const char* szFile, size_t iSkip, 00166 const char* szSimilarityMeasure, bool fNormalize, bool fZScore, 00167 bool fAutocorrelate, const char* szGeneFile, float dCutoff, 00168 size_t iLimit, CPCL& PCL, CDat& Dat, IMeasure::EMap eMap, 00169 bool fFrequencyWeight, float dAlpha, int nThreads) { 00170 00171 00172 size_t i, j, iOne, iTwo; 00173 float d; 00174 ifstream ifsm; 00175 vector<string> vecstrGenes; 00176 CGenome Genome; 00177 CGenes GenesIn(Genome); 00178 vector<size_t> veciGenes; 00179 const float* adOne; 00180 const float* adWeights; 00181 vector<float> vecdWeights; 00182 IMeasure* pMeasure; 00183 CMeasurePearson Pearson; 00184 CMeasureEuclidean Euclidean; 00185 CMeasureKendallsTau KendallsTau; 00186 CMeasureKolmogorovSmirnov KolmSmir; 00187 CMeasureSpearman Spearman(true); 00188 CMeasurePearNorm PearNorm; 00189 CMeasureHypergeometric Hypergeom; 00190 CMeasureQuickPearson PearQuick; 00191 CMeasureInnerProduct InnerProd; 00192 CMeasureBinaryInnerProduct BinInnerProd; 00193 CMeasureMutualInformation MutualInfo; 00194 CMeasureRelativeAUC RelAuc; 00195 CMeasurePearsonSignificance PearSig; 00196 CMeasureDice Dice( dAlpha ); 00197 CMeasureDistanceCorrelation DCor; 00198 CMeasureSignedDistanceCorrelation SDCor; 00199 if (szFile) { 00200 g_CatSleipnir().debug("Opening PCL for distance"); 00201 if (!PCL.Open(szFile, iSkip, false, false)) { 00202 g_CatSleipnir().error( 00203 "CPCL::Distance( %s, %d, %s, %d, %d, %d, %s, %g ) failed to open PCL", 00204 szFile, iSkip, szSimilarityMeasure, fNormalize, fZScore, 00205 fAutocorrelate, szGeneFile ? szGeneFile : "", dCutoff); 00206 return 1; 00207 } 00208 ifsm.close(); 00209 } else if (!PCL.Open(cin, iSkip)) { 00210 g_CatSleipnir().error( 00211 "CPCL::Distance( %s, %d, %s, %d, %d, %d, %s, %g ) failed to open PCL", 00212 "stdin", iSkip, szSimilarityMeasure, fNormalize, fZScore, 00213 fAutocorrelate, szGeneFile ? szGeneFile : "", dCutoff); 00214 return 1; 00215 } 00216 if (!szSimilarityMeasure) 00217 return 0; 00218 00219 CMeasureSigmoid 00220 EuclideanSig(&Euclidean, false, 1.0f / PCL.GetExperiments()); 00221 IMeasure* apMeasures[] = { &Pearson, &EuclideanSig, &KendallsTau, 00222 &KolmSmir, &Spearman, &PearNorm, &Hypergeom, &PearQuick, 00223 &InnerProd, &BinInnerProd, &MutualInfo, &RelAuc, &PearSig, &Dice,&DCor,&SDCor, NULL }; 00224 00225 pMeasure = NULL; 00226 for (i = 0; apMeasures[i]; ++i) 00227 if (!strcmp(apMeasures[i]->GetName(), szSimilarityMeasure)) { 00228 pMeasure = apMeasures[i]; 00229 break; 00230 } 00231 if (!pMeasure) 00232 return 1; 00233 g_CatSleipnir().info("Method: %s ", pMeasure->GetName(), i); 00234 if (fFrequencyWeight) { 00235 vecdWeights.resize(PCL.GetExperiments()); 00236 for (i = 0; i < vecdWeights.size(); ++i) { 00237 for (iOne = j = 0; j < PCL.GetGenes(); ++j) 00238 if (!CMeta::IsNaN(d = PCL.Get(j, i)) && (d > 0)) 00239 iOne++; 00240 vecdWeights[i] = (float) ((PCL.GetGenes() + 1) - iOne) 00241 / PCL.GetGenes(); 00242 } 00243 } 00244 adWeights = vecdWeights.empty() ? NULL : &vecdWeights[0]; 00245 00246 CMeasureAutocorrelate Autocorrelate(pMeasure, false); 00247 if (fAutocorrelate) 00248 pMeasure = &Autocorrelate; 00249 00250 if (szGeneFile) { 00251 ifsm.clear(); 00252 ifsm.open(szGeneFile); 00253 if (!GenesIn.Open(ifsm)) { 00254 g_CatSleipnir().error( 00255 "CPCL::Distance( %s, %d, %s, %d, %d, %d, %s, %g ) failed to open genes", 00256 szFile ? szFile : "stdin", iSkip, szSimilarityMeasure, 00257 fNormalize, fZScore, fAutocorrelate, szGeneFile, dCutoff); 00258 return 1; 00259 } 00260 ifsm.close(); 00261 } else 00262 GenesIn.Open(PCL.GetGeneNames()); 00263 veciGenes.resize(GenesIn.GetGenes()); 00264 for (i = 0; i < veciGenes.size(); ++i) 00265 veciGenes[i] = szGeneFile ? PCL.GetGene(GenesIn.GetGene(i).GetName()) 00266 : i; 00267 00268 if (pMeasure->IsRank()) 00269 PCL.RankTransform(); 00270 00271 g_CatSleipnir().info("Number of Experiments: %d", PCL.GetExperiments()); 00272 00273 if ((iLimit != -1) && (PCL.GetGenes() > iLimit)) 00274 Dat.Open(PCL, pMeasure->Clone(), true); 00275 else { 00276 Dat.Open(GenesIn.GetGeneNames()); 00277 for (i = 0; i < Dat.GetGenes(); ++i) 00278 for (j = (i + 1); j < Dat.GetGenes(); ++j) 00279 Dat.Set(i, j, CMeta::GetNaN()); 00280 int origNThreads = omp_get_num_threads(); 00281 omp_set_num_threads(nThreads); 00282 for (i = 0; i < GenesIn.GetGenes(); ++i) { 00283 if (!(i % 100)) 00284 g_CatSleipnir().info( 00285 "CPCL::Distance( %s, %d, %s, %d, %d, %d, %s, %g ) processing gene %d/%d", 00286 szFile ? szFile : "stdin", iSkip, szSimilarityMeasure, 00287 fNormalize, fZScore, fAutocorrelate, 00288 szGeneFile ? szGeneFile : "", dCutoff, i, 00289 GenesIn.GetGenes()); 00290 if ((iOne = veciGenes[i]) == -1) 00291 continue; 00292 adOne = PCL.Get(iOne); 00293 for (j = (i + 1); j < GenesIn.GetGenes(); ++j){ 00294 if ((iTwo = veciGenes[j]) != -1){ 00295 Dat.Set(i, j, (float) pMeasure->Measure(adOne, 00296 PCL.GetExperiments(), PCL.Get(iTwo), 00297 PCL.GetExperiments(), eMap, adWeights, adWeights)); 00298 } 00299 } 00300 } 00301 omp_set_num_threads(origNThreads); 00302 if (fNormalize || fZScore) 00303 Dat.Normalize(fZScore ? CDat::ENormalizeZScore 00304 : CDat::ENormalizeMinMax); 00305 if (!CMeta::IsNaN(dCutoff)) 00306 for (i = 0; i < Dat.GetGenes(); ++i) 00307 for (j = (i + 1); j < Dat.GetGenes(); ++j) 00308 if (!CMeta::IsNaN(d = Dat.Get(i, j)) && (d < dCutoff)) 00309 Dat.Set(i, j, CMeta::GetNaN()); 00310 00311 if(pMeasure->GetName()=="pearson" || pMeasure->GetName()=="pearnorm"){ 00312 vector<unsigned long> bins; 00313 bins.resize(55); 00314 float upper = 0; 00315 float lower = 0; 00316 if(pMeasure->GetName()=="pearson"){ 00317 upper = 1.0; 00318 lower = -1.0; 00319 }else if(pMeasure->GetName()=="pearnorm"){ 00320 upper = 5.0; 00321 lower = -5.0; 00322 } 00323 float bin_size = (upper - lower) / 50; 00324 for(i=0; i<55; i++) 00325 bins[i] = 0; 00326 for(i=0; i<Dat.GetGenes(); i++){ 00327 for(j=i+1; j<Dat.GetGenes(); j++){ 00328 d = Dat.Get(i,j); 00329 if(CMeta::IsNaN(d)) continue; 00330 int b = (int) ((d - lower) / bin_size); 00331 if(b<0){ 00332 bins[0]++; 00333 continue; 00334 } 00335 if(b>=55){ 00336 bins[54]++; 00337 continue; 00338 } 00339 bins[b]++; 00340 } 00341 } 00342 g_CatSleipnir().info( 00343 "Distribution of distances: bin size: %.5f, number of bins: %d, min bin value: %.5f, max bin value: %.5f", 00344 bin_size, 55, lower, upper); 00345 for(i=0; i<55; i++){ 00346 g_CatSleipnir().info("%lu\t%lu", i, bins[i]); 00347 } 00348 } 00349 00350 } 00351 00352 return 0; 00353 } 00354 00426 int CPCL::Distance(const char* szFile, size_t iSkip, const char* szWeights, 00427 const char* szSimilarityMeasure, bool fNormalize, bool fZScore, 00428 bool fAutocorrelate, const char* szGeneFile, float dCutoff, 00429 size_t iLimit, CPCL& PCL, CDat& Dat, IMeasure::EMap eMap, 00430 bool fFrequencyWeight, float dAlpha, int nThreads) { 00431 size_t i, j, iOne, iTwo; 00432 float d; 00433 ifstream ifsm; 00434 vector<string> vecstrGenes; 00435 CGenome Genome; 00436 CGenes GenesIn(Genome); 00437 vector<size_t> veciGenes; 00438 const float* adOne; 00439 const float* adWeights; 00440 vector<float> vecdWeights; 00441 vector<string> vecstrWeights; 00442 string acStr; 00443 IMeasure* pMeasure; 00444 CMeasurePearson Pearson; 00445 CMeasureEuclidean Euclidean; 00446 CMeasureKendallsTau KendallsTau; 00447 CMeasureKolmogorovSmirnov KolmSmir; 00448 CMeasureSpearman Spearman(true); 00449 CMeasurePearNorm PearNorm; 00450 CMeasureHypergeometric Hypergeom; 00451 CMeasureQuickPearson PearQuick; 00452 CMeasureInnerProduct InnerProd; 00453 CMeasureBinaryInnerProduct BinInnerProd; 00454 CMeasureMutualInformation MutualInfo; 00455 CMeasureRelativeAUC RelAuc; 00456 CMeasurePearsonSignificance PearSig; 00457 CMeasureDice Dice( dAlpha ); 00458 CMeasureDistanceCorrelation DCor; 00459 CMeasureSignedDistanceCorrelation SDCor; 00460 if (szFile) { 00461 g_CatSleipnir().debug("Opening PCL for distance"); 00462 if (!PCL.Open(szFile, iSkip, false, false)) { 00463 g_CatSleipnir().error( 00464 "CPCL::Distance( %s, %d, %s, %d, %d, %d, %s, %g ) failed to open PCL", 00465 szFile, iSkip, szSimilarityMeasure, fNormalize, fZScore, 00466 fAutocorrelate, szGeneFile ? szGeneFile : "", dCutoff); 00467 return 1; 00468 } 00469 ifsm.close(); 00470 } else if (!PCL.Open(cin, iSkip)) { 00471 g_CatSleipnir().error( 00472 "CPCL::Distance( %s, %d, %s, %d, %d, %d, %s, %g ) failed to open PCL", 00473 "stdin", iSkip, szSimilarityMeasure, fNormalize, fZScore, 00474 fAutocorrelate, szGeneFile ? szGeneFile : "", dCutoff); 00475 return 1; 00476 } 00477 if (!szSimilarityMeasure) 00478 return 0; 00479 00480 CMeasureSigmoid 00481 EuclideanSig(&Euclidean, false, 1.0f / PCL.GetExperiments()); 00482 IMeasure* apMeasures[] = { &Pearson, &EuclideanSig, &KendallsTau, 00483 &KolmSmir, &Spearman, &PearNorm, &Hypergeom, &PearQuick, 00484 &InnerProd, &BinInnerProd, &MutualInfo, &RelAuc, &PearSig, &Dice,&DCor,&SDCor, NULL }; 00485 00486 pMeasure = NULL; 00487 for (i = 0; apMeasures[i]; ++i) 00488 if (!strcmp(apMeasures[i]->GetName(), szSimilarityMeasure)) { 00489 pMeasure = apMeasures[i]; 00490 break; 00491 } 00492 if (!pMeasure) 00493 return 1; 00494 g_CatSleipnir().info("Method: %s ", pMeasure->GetName(), i); 00495 if (fFrequencyWeight) { 00496 vecdWeights.resize(PCL.GetExperiments()); 00497 for (i = 0; i < vecdWeights.size(); ++i) { 00498 for (iOne = j = 0; j < PCL.GetGenes(); ++j) 00499 if (!CMeta::IsNaN(d = PCL.Get(j, i)) && (d > 0)) 00500 iOne++; 00501 vecdWeights[i] = (float) ((PCL.GetGenes() + 1) - iOne) 00502 / PCL.GetGenes(); 00503 } 00504 } 00505 adWeights = vecdWeights.empty() ? NULL : &vecdWeights[0]; 00506 00507 if ( adWeights == NULL ) { 00508 if (szWeights != NULL) { 00509 ifsm.open(szWeights); 00510 getline(ifsm, acStr); 00511 ifsm.close(); 00512 CMeta::Tokenize( acStr.c_str(), vecstrWeights, CMeta::c_szWS, true); 00513 if ( vecstrWeights.size() != PCL.GetExperiments()) { 00514 g_CatSleipnir().error( 00515 "CPCL::Distance( %s, %d, %s, %d, %d, %d, %s, %g ) length of weights file (%s, length: %d) did not match number of experiments (%d).", 00516 szFile, iSkip, szSimilarityMeasure, fNormalize, fZScore, 00517 fAutocorrelate, szGeneFile ? szGeneFile : "", dCutoff, szWeights, 00518 vecstrWeights.size(), PCL.GetExperiments()); 00519 return 1; 00520 } 00521 vecdWeights.resize(PCL.GetExperiments()); 00522 for (i = 0; i < vecstrWeights.size(); ++i) { 00523 vecdWeights[i] = atof(vecstrWeights[i].c_str()); 00524 } 00525 adWeights = vecdWeights.empty() ? NULL : &vecdWeights[0]; 00526 } 00527 } 00528 00529 CMeasureAutocorrelate Autocorrelate(pMeasure, false); 00530 if (fAutocorrelate) 00531 pMeasure = &Autocorrelate; 00532 00533 if (szGeneFile) { 00534 ifsm.clear(); 00535 ifsm.open(szGeneFile); 00536 if (!GenesIn.Open(ifsm)) { 00537 g_CatSleipnir().error( 00538 "CPCL::Distance( %s, %d, %s, %d, %d, %d, %s, %g ) failed to open genes", 00539 szFile ? szFile : "stdin", iSkip, szSimilarityMeasure, 00540 fNormalize, fZScore, fAutocorrelate, szGeneFile, dCutoff); 00541 return 1; 00542 } 00543 ifsm.close(); 00544 } else 00545 GenesIn.Open(PCL.GetGeneNames()); 00546 veciGenes.resize(GenesIn.GetGenes()); 00547 for (i = 0; i < veciGenes.size(); ++i) 00548 veciGenes[i] = szGeneFile ? PCL.GetGene(GenesIn.GetGene(i).GetName()) 00549 : i; 00550 00551 if (pMeasure->IsRank()) 00552 PCL.RankTransform(); 00553 00554 g_CatSleipnir().info("Number of Experiments: %d", PCL.GetExperiments()); 00555 00556 if ((iLimit != -1) && (PCL.GetGenes() > iLimit)) 00557 Dat.Open(PCL, pMeasure->Clone(), true); 00558 else { 00559 Dat.Open(GenesIn.GetGeneNames()); 00560 for (i = 0; i < Dat.GetGenes(); ++i) 00561 for (j = (i + 1); j < Dat.GetGenes(); ++j) 00562 Dat.Set(i, j, CMeta::GetNaN()); 00563 for (i = 0; i < GenesIn.GetGenes(); ++i) { 00564 if (!(i % 100)) 00565 g_CatSleipnir().info( 00566 "CPCL::Distance( %s, %d, %s, %d, %d, %d, %s, %g ) processing gene %d/%d", 00567 szFile ? szFile : "stdin", iSkip, szSimilarityMeasure, 00568 fNormalize, fZScore, fAutocorrelate, 00569 szGeneFile ? szGeneFile : "", dCutoff, i, 00570 GenesIn.GetGenes()); 00571 if ((iOne = veciGenes[i]) == -1) 00572 continue; 00573 adOne = PCL.Get(iOne); 00574 #pragma omp parallel for num_threads(nThreads) 00575 for (j = (i + 1); j < GenesIn.GetGenes(); ++j){ 00576 if ((iTwo = veciGenes[j]) != -1){ 00577 Dat.Set(i, j, (float) pMeasure->Measure(adOne, 00578 PCL.GetExperiments(), PCL.Get(iTwo), 00579 PCL.GetExperiments(), eMap, adWeights, adWeights)); 00580 } 00581 } 00582 } 00583 00584 if (fNormalize || fZScore) 00585 Dat.Normalize(fZScore ? CDat::ENormalizeZScore 00586 : CDat::ENormalizeMinMax); 00587 00588 if (!CMeta::IsNaN(dCutoff)) 00589 for (i = 0; i < Dat.GetGenes(); ++i) 00590 for (j = (i + 1); j < Dat.GetGenes(); ++j) 00591 if (!CMeta::IsNaN(d = Dat.Get(i, j)) && (d < dCutoff)) 00592 Dat.Set(i, j, CMeta::GetNaN()); 00593 00594 if(pMeasure->GetName()=="pearson" || pMeasure->GetName()=="pearnorm"){ 00595 vector<unsigned long> bins; 00596 bins.resize(55); 00597 float upper = 0; 00598 float lower = 0; 00599 if(pMeasure->GetName()=="pearson"){ 00600 upper = 1.0; 00601 lower = -1.0; 00602 }else if(pMeasure->GetName()=="pearnorm"){ 00603 upper = 5.0; 00604 lower = -5.0; 00605 } 00606 float bin_size = (upper - lower) / 50; 00607 for(i=0; i<55; i++) 00608 bins[i] = 0; 00609 for(i=0; i<Dat.GetGenes(); i++){ 00610 for(j=i+1; j<Dat.GetGenes(); j++){ 00611 d = Dat.Get(i,j); 00612 if(CMeta::IsNaN(d)) continue; 00613 int b = (int) ((d - lower) / bin_size); 00614 if(b<0){ 00615 bins[0]++; 00616 continue; 00617 } 00618 if(b>=55){ 00619 bins[54]++; 00620 continue; 00621 } 00622 bins[b]++; 00623 } 00624 } 00625 g_CatSleipnir().info( 00626 "Distribution of distances: bin size: %.5f, number of bins: %d, min bin value: %.5f, max bin value: %.5f", 00627 bin_size, 55, lower, upper); 00628 for(i=0; i<55; i++){ 00629 g_CatSleipnir().info("%lu\t%lu", i, bins[i]); 00630 } 00631 } 00632 } 00633 return 0; 00634 } 00635 00636 CPCLImpl::~CPCLImpl() { 00637 00638 Reset(); 00639 } 00640 00641 void CPCLImpl::Reset() { 00642 00643 m_Data.Reset(); 00644 m_vecstrGenes.clear(); 00645 m_vecstrExperiments.clear(); 00646 m_vecstrFeatures.clear(); 00647 m_vecvecstrFeatures.clear(); 00648 m_setiGenes.clear(); 00649 m_mapstriGenes.clear(); 00650 } 00651 00663 //bool CPCL::Open(const char* szFile, size_t iSkip, bool Memmap) { 00664 // std::ifstream ifsm; 00665 // if (szFile) 00666 // ifsm.open(szFile); 00667 // 00668 // // is this a binary binary file? 00669 // if (!strcmp(szFile + strlen(szFile) - strlen(c_szBinExtension), 00670 // c_szBinExtension)) 00671 // return OpenBinary(szFile ? ifsm : cin, Memmap); 00672 // else 00673 // // is this a text based PCL file? 00674 // return Open(szFile ? ifsm : cin, iSkip); 00675 //} 00676 00677 00678 bool CPCL::Open(const char* szFile, size_t iSkip, bool Memmap, bool rTable) { 00679 bool isBinary = false; 00680 bool isDAB = false; 00681 ifstream ifsm; 00682 if (!strcmp(szFile + strlen(szFile) - strlen(c_szBinExtension), c_szBinExtension)) { 00683 isBinary = true; 00684 } 00685 if (!strcmp(szFile + strlen(szFile) - strlen(c_szDabExtension), c_szDabExtension)) { 00686 isDAB = true; 00687 } 00688 if (isBinary && Memmap) { 00689 g_CatSleipnir().notice("CPCL::Open, openning with memory mapping"); 00690 Reset(); 00691 if (!CMeta::MapRead(m_abData, m_hndlData, m_iData, szFile)) { 00692 g_CatSleipnir().error("CPCL::Open( %s, %d ) failed memory mapping", 00693 szFile, Memmap); 00694 00695 return false; 00696 } 00697 bool fRet = OpenHelper(); 00698 return fRet; 00699 } 00700 else if (isBinary) { 00701 ifsm.open(szFile, ios::binary); 00702 return OpenBinary(ifsm); 00703 } else if (isDAB) { 00704 CDat dat; 00705 if (!dat.Open(szFile,false, 0, false, false)) { 00706 cerr << "Could not open dab to make pcl." << endl; 00707 return false; 00708 } 00709 std::vector<std::string> features; 00710 std::vector<std::string> genes = dat.GetGeneNames(); 00711 Open(genes, genes, features); 00712 for (size_t i = 0; i < dat.GetGenes(); i++) { 00713 for (size_t j = i; j < dat.GetGenes(); j++) { 00714 if (i == j) { 00715 Set(i, j, 1); 00716 } 00717 else { 00718 float pairVal = dat.Get(i,j); 00719 Set(i, j, pairVal); 00720 Set(j, i, pairVal); 00721 } 00722 } 00723 } 00724 return true; 00725 } else { 00726 ifstream ifsm; 00727 if (szFile) 00728 ifsm.open(szFile); 00729 return Open(szFile ? ifsm : cin, iSkip, rTable); 00730 } 00731 } 00732 00733 00734 00748 void CPCL::Save(const char* szFile) { 00749 std::ofstream ofsm; 00750 00751 if (szFile) 00752 ofsm.open(szFile); 00753 00754 // Save as binary PCL file? 00755 if (!strcmp(szFile + strlen(szFile) - strlen(c_szExtension), c_szBinExtension)) 00756 SaveBinary(szFile ? ofsm : cout); 00757 else 00758 Save(szFile ? ofsm : cout); 00759 // Save as text file 00760 00761 } 00762 00774 void CPCL::Open(const CPCL& PCL) { 00775 size_t i, j; 00776 TSetI::const_iterator iterGene; 00777 TMapStrI::const_iterator iterGene2; 00778 00779 Reset(); 00780 m_fHeader = PCL.m_fHeader; 00781 m_Data.Initialize(PCL.m_Data.GetRows(), PCL.m_Data.GetColumns()); 00782 for (i = 0; i < m_Data.GetRows(); ++i) 00783 for (j = 0; j < m_Data.GetColumns(); ++j) 00784 m_Data.Set(i, j, PCL.m_Data.Get(i, j)); 00785 00786 for (iterGene = PCL.m_setiGenes.begin(); iterGene != PCL.m_setiGenes.end(); ++iterGene) 00787 m_setiGenes.insert(*iterGene); 00788 m_vecstrExperiments.resize(PCL.m_vecstrExperiments.size()); 00789 copy(PCL.m_vecstrExperiments.begin(), PCL.m_vecstrExperiments.end(), 00790 m_vecstrExperiments.begin()); 00791 m_vecstrFeatures.resize(PCL.m_vecstrFeatures.size()); 00792 copy(PCL.m_vecstrFeatures.begin(), PCL.m_vecstrFeatures.end(), 00793 m_vecstrFeatures.begin()); 00794 m_vecstrGenes.resize(PCL.m_vecstrGenes.size()); 00795 copy(PCL.m_vecstrGenes.begin(), PCL.m_vecstrGenes.end(), 00796 m_vecstrGenes.begin()); 00797 for (iterGene2 = PCL.m_mapstriGenes.begin(); iterGene2 00798 != PCL.m_mapstriGenes.end(); ++iterGene2) 00799 m_mapstriGenes[iterGene2->first] = iterGene2->second; 00800 m_vecvecstrFeatures.resize(PCL.m_vecvecstrFeatures.size()); 00801 for (i = 0; i < m_vecvecstrFeatures.size(); ++i) { 00802 m_vecvecstrFeatures[i].resize(PCL.m_vecvecstrFeatures[i].size()); 00803 copy(PCL.m_vecvecstrFeatures[i].begin(), 00804 PCL.m_vecvecstrFeatures[i].end(), 00805 m_vecvecstrFeatures[i].begin()); 00806 } 00807 } 00808 00829 bool CPCL::Open(std::istream& istm, size_t iSkip, bool rTable) { 00830 vector<float> vecdData; 00831 size_t i; 00832 char* acBuf; 00833 bool fRet; 00834 string acStr; 00835 00836 if (!istm.good()) 00837 return false; 00838 00839 acStr.reserve(c_iBufferSize); 00840 00841 if (!OpenExperiments(istm, iSkip, acStr, rTable)) 00842 fRet = false; 00843 else { 00844 m_vecvecstrFeatures.resize(m_vecstrFeatures.size() - 1); 00845 while (OpenGene(istm, vecdData, acStr)) 00846 ; 00847 for (fRet = !!GetGenes(), i = 0; i < GetGenes(); ++i) 00848 if (GetGene(i).empty() || !isprint(GetGene(i)[0])) { 00849 fRet = false; 00850 g_CatSleipnir().error( 00851 "CPCL::Open( %d ) invalid gene at index %d: %s", iSkip, 00852 i, GetGene(i).c_str()); 00853 break; 00854 } 00855 if (fRet) { 00856 m_Data.Initialize(GetGenes(), GetExperiments()); 00857 for (i = 0; i < m_Data.GetRows(); ++i) 00858 m_Data.Set(i, &vecdData[i * m_Data.GetColumns()]); 00859 } 00860 } 00861 //delete[] acBuf; 00862 00863 return fRet; 00864 } 00865 00866 bool CPCLImpl::OpenHelper() { 00867 unsigned char* pb; 00868 pb = m_abData; 00869 size_t i, j; 00870 m_vecstrFeatures.resize(*(uint32_t*) pb); 00871 pb += sizeof(uint32_t); 00872 for (i = 0; i < m_vecstrFeatures.size(); ++i) { 00873 string& strFeature = m_vecstrFeatures[i]; 00874 strFeature.resize(*(uint32_t*) pb); 00875 pb += sizeof(uint32_t); 00876 for (j = 0; j < strFeature.size(); ++j) 00877 strFeature[j] = *pb++; 00878 } 00879 m_vecstrExperiments.resize(*(uint32_t*) pb); 00880 i = 0; 00881 pb += sizeof(uint32_t); 00882 for (i = 0; i < m_vecstrExperiments.size(); ++i) { 00883 string& strExperiment = m_vecstrExperiments[i]; 00884 strExperiment.resize(*(uint32_t*) pb); 00885 pb += sizeof(uint32_t); 00886 for (j = 0; j < strExperiment.size(); ++j) { 00887 strExperiment[j] = *pb++; 00888 } 00889 } 00890 m_vecstrGenes.resize(*(uint32_t*) pb); 00891 pb += sizeof(uint32_t); 00892 for (i = 0; i < m_vecstrGenes.size(); ++i) { 00893 string& strGene = m_vecstrGenes[i]; 00894 strGene.resize(*(uint32_t*) pb); 00895 pb += sizeof(uint32_t); 00896 for (j = 0; j < strGene.size(); ++j) 00897 strGene[j] = *pb++; 00898 m_mapstriGenes[strGene] = i; 00899 } 00900 return OpenMemmap(pb); 00901 00902 } 00903 00904 bool CPCLImpl::OpenMemmap(const unsigned char* pb) { 00905 size_t i; 00906 00907 m_aadData = new float*[m_vecstrGenes.size()]; 00908 m_aadData[0] = (float*) pb; 00909 for (i = 1; i < m_vecstrGenes.size(); ++i) 00910 m_aadData[i] = m_aadData[i - 1] + m_vecstrExperiments.size(); 00911 m_Data.Initialize(m_vecstrGenes.size(), m_vecstrExperiments.size(), (float**) m_aadData); 00912 return true; 00913 } 00914 00915 bool CPCLImpl::OpenExperiments(std::istream& istmInput, size_t iFeatures, 00916 string& acStr, bool rTable) { 00917 const char* pc; 00918 string strToken; 00919 size_t iToken; 00920 00921 Reset(); 00922 if (!m_fHeader) { 00923 m_vecstrFeatures.resize(1 + iFeatures); 00924 return true; 00925 } 00926 getline(istmInput, acStr); 00927 if (!acStr.size()) { 00928 return false; 00929 } 00930 00931 if(rTable){ 00932 m_vecstrFeatures.push_back(""); 00933 } 00934 00935 for (iToken = 0, pc = acStr.c_str(); (strToken = OpenToken(pc, &pc)).length() 00936 || *pc; ++iToken) 00937 if (iToken < iFeatures) 00938 m_vecstrFeatures.push_back(strToken); 00939 else if (iToken <= iFeatures && !rTable) 00940 m_vecstrFeatures.push_back(strToken); 00941 else 00942 m_vecstrExperiments.push_back(strToken); 00943 00944 return true; 00945 } 00946 00947 bool CPCLImpl::OpenGene(std::istream& istmInput, std::vector<float>& vecdData, 00948 string& acStr) { 00949 const char* pc; 00950 char* pcEnd; 00951 string strToken; 00952 size_t iToken, iData, iBase, i; 00953 float d; 00954 map<string, size_t> mapstriValues; 00955 map<string, size_t>::iterator iterValue; 00956 const char* acLine = acStr.c_str(); 00957 00958 iBase = vecdData.size(); 00959 getline(istmInput, acStr); 00960 if ((strToken = OpenToken(acLine)).empty()) 00961 return false; 00962 if (strToken == "EWEIGHT") 00963 return true; 00964 if (!m_vecstrExperiments.empty()) 00965 vecdData.resize(vecdData.size() + m_vecstrExperiments.size()); 00966 for (iData = iToken = 0, pc = acLine; (strToken = OpenToken(pc, &pc)).length() 00967 || *pc; ++iToken) { 00968 if (!iToken) { 00969 m_mapstriGenes[strToken] = m_vecstrGenes.size(); 00970 m_vecstrGenes.push_back(strToken); 00971 } else if (iToken < m_vecstrFeatures.size()) 00972 m_vecvecstrFeatures[iToken - 1].push_back(strToken); 00973 else if (!m_vecstrExperiments.empty() && (iData 00974 >= m_vecstrExperiments.size())) 00975 return false; 00976 else { 00977 d = CMeta::GetNaN(); 00978 strToken = CMeta::Trim(strToken.c_str()); 00979 if (strToken.length()) { 00980 d = (float) strtod(strToken.c_str(), &pcEnd); 00981 if (pcEnd != (strToken.c_str() + strToken.length())) { 00982 iterValue = mapstriValues.find(strToken); 00983 if (iterValue == mapstriValues.end()) { 00984 i = mapstriValues.size(); 00985 mapstriValues[strToken] = i; 00986 d = i; 00987 } else 00988 d = iterValue->second; 00989 } 00990 } 00991 if (m_vecstrExperiments.empty()) 00992 vecdData.push_back(d); 00993 else if ((i = (iBase + iData++)) >= vecdData.size()) 00994 return false; 00995 else 00996 vecdData[i] = d; 00997 } 00998 } 00999 01000 if (m_vecstrExperiments.empty()) 01001 m_vecstrExperiments.resize(vecdData.size()); 01002 else 01003 while (iData < m_vecstrExperiments.size()) 01004 vecdData[iBase + iData++] = CMeta::GetNaN(); 01005 01006 return !!iToken; 01007 } 01008 01026 void CPCL::SaveHeader(std::ostream& ostm, bool fCDT) const { 01027 size_t i; 01028 01029 if (!m_fHeader) 01030 return; 01031 01032 /* if (fCDT) 01033 ostm << c_szGID << '\t'; */ 01034 ostm << m_vecstrFeatures[0]; //Gene name 01035 // for (i = 1; i < m_vecstrFeatures.size(); ++i) 01036 // ostm << '\t' << m_vecstrFeatures[i]; 01037 for (i = 0; i < m_vecstrExperiments.size(); ++i) 01038 ostm << '\t' << m_vecstrExperiments[i]; 01039 ostm << endl; 01040 01041 // ostm << c_szEWEIGHT; 01042 // for (i = fCDT ? 0 : 1; i < m_vecstrFeatures.size(); ++i) 01043 // ostm << '\t'; 01044 // for (i = 0; i < m_vecstrExperiments.size(); ++i) 01045 // ostm << '\t' << 1; 01046 // ostm << endl; 01047 } 01048 01070 void CPCL::SaveGene(std::ostream& ostm, size_t iGene, size_t iOriginal) const { 01071 size_t i; 01072 float d; 01073 01074 if (iOriginal != -1) 01075 ostm << c_szGENE << iOriginal << '\t'; 01076 ostm << GetGene(iGene); 01077 for (i = 0; i < m_vecvecstrFeatures.size(); ++i) 01078 ostm << '\t' << m_vecvecstrFeatures[i][iGene]; 01079 for (i = 0; i < m_vecstrExperiments.size(); ++i) { 01080 ostm << '\t'; 01081 if (!CMeta::IsNaN(d = Get(iGene, i))) 01082 ostm << Get(iGene, i); 01083 } 01084 ostm << endl; 01085 } 01086 01108 //Save only certain genes 01109 bool CPCL::Save(const char* szFile, const std::vector<size_t>* pveciGenes) const { 01110 ofstream ofsm; 01111 if (!strcmp(szFile + strlen(szFile) - 4, ".bin")) { 01112 ofsm.open(szFile, ios::binary); 01113 SaveBinary(ofsm); 01114 } else { 01115 ofsm.open(szFile); 01116 Save(ofsm, pveciGenes); 01117 } 01118 01119 } 01120 void CPCL::Save(std::ostream& ostm, const std::vector<size_t>* pveciGenes) const { 01121 size_t i; 01122 01123 SaveHeader(ostm, !!pveciGenes); 01124 01125 for (i = 0; i < GetGenes(); ++i) { 01126 if (m_setiGenes.find(i) != m_setiGenes.end()) 01127 continue; 01128 SaveGene(ostm, i, pveciGenes ? (*pveciGenes)[i] : -1); 01129 } 01130 } 01131 01147 void CPCL::SaveBinary(std::ostream& ostm) const { 01148 uint32_t iTmp; 01149 size_t i; 01150 01151 iTmp = GetFeatures(); 01152 ostm.write((const char*) &iTmp, sizeof(iTmp)); 01153 for (i = 0; i < GetFeatures(); ++i) 01154 SaveString(ostm, GetFeature(i)); 01155 iTmp = GetExperiments(); 01156 ostm.write((const char*) &iTmp, sizeof(iTmp)); 01157 for (i = 0; i < GetExperiments(); ++i) 01158 SaveString(ostm, GetExperiment(i)); 01159 size_t iCount = 0; 01160 for (i = 0; i < GetGenes(); i++) 01161 if (!IsMasked(i)) 01162 iCount++; 01163 iTmp = iCount; 01164 ostm.write((const char*) &iTmp, sizeof(iTmp)); 01165 for (i = 0; i < GetGenes(); ++i) 01166 if (!IsMasked(i)) 01167 SaveString(ostm, GetGene(i)); 01168 01169 for (i = 0; i < GetGenes(); ++i) 01170 if (!IsMasked(i)) 01171 ostm.write((const char*) Get(i), GetExperiments() * sizeof(*Get(i))); 01172 } 01173 01184 bool CPCL::OpenBinary(std::istream& istm) { 01185 uint32_t iTmp; 01186 size_t i; 01187 01188 if (!istm.good()) 01189 return false; 01190 01191 01192 Reset(); 01193 istm.read((char*) &iTmp, sizeof(iTmp)); 01194 m_vecstrFeatures.resize(iTmp); 01195 for (i = 0; i < m_vecstrFeatures.size(); ++i) 01196 OpenString(istm, m_vecstrFeatures[i]); 01197 01198 istm.read((char*) &iTmp, sizeof(iTmp)); 01199 m_vecstrExperiments.resize(iTmp); 01200 for (i = 0; i < m_vecstrExperiments.size(); ++i) 01201 OpenString(istm, m_vecstrExperiments[i]); 01202 01203 istm.read((char*) &iTmp, sizeof(iTmp)); 01204 m_vecstrGenes.resize(iTmp); 01205 for (i = 0; i < m_vecstrGenes.size(); ++i) { 01206 OpenString(istm, m_vecstrGenes[i]); 01207 m_mapstriGenes[m_vecstrGenes[i]] = i; 01208 } 01209 01210 m_Data.Initialize(GetGenes(), GetExperiments()); 01211 for (i = 0; i < m_Data.GetRows(); ++i) 01212 istm.read((char*) m_Data.Get(i), GetExperiments() * sizeof(*m_Data.Get( 01213 i))); 01214 01215 return true; 01216 } 01217 01236 void CPCL::Open(const std::vector<std::string>& vecstrGenes, const std::vector< 01237 std::string>& vecstrExperiments, 01238 const std::vector<std::string>& vecstrFeatures) { 01239 size_t i, j; 01240 01241 Reset(); 01242 if (vecstrFeatures.empty()) 01243 m_vecstrFeatures.push_back("GID"); 01244 else { 01245 m_vecstrFeatures.resize(vecstrFeatures.size()); 01246 copy(vecstrFeatures.begin(), vecstrFeatures.end(), 01247 m_vecstrFeatures.begin()); 01248 m_vecvecstrFeatures.resize(m_vecstrFeatures.size() - 1); 01249 for (i = 0; i < m_vecvecstrFeatures.size(); ++i) 01250 m_vecvecstrFeatures[i].resize(vecstrGenes.size()); 01251 } 01252 01253 m_vecstrGenes.resize(vecstrGenes.size()); 01254 for (i = 0; i < GetGenes(); ++i) 01255 SetGene(i, vecstrGenes[i]); 01256 m_vecstrExperiments.resize(vecstrExperiments.size()); 01257 for (i = 0; i < GetExperiments(); ++i) 01258 SetExperiment(i, vecstrExperiments[i]); 01259 01260 m_Data.Initialize(GetGenes(), GetExperiments()); 01261 for (i = 0; i < m_Data.GetRows(); ++i) 01262 for (j = 0; j < m_Data.GetColumns(); ++j) 01263 m_Data.Set(i, j, CMeta::GetNaN()); 01264 } 01265 01291 void CPCL::Open(const std::vector<size_t>& veciGenes, const std::vector< 01292 std::string>& vecstrGenes, 01293 const std::vector<std::string>& vecstrExperiments) { 01294 size_t i, j; 01295 char ac[16]; 01296 01297 Reset(); 01298 m_vecstrFeatures.resize(4); 01299 m_vecstrFeatures[0] = "GID"; 01300 m_vecstrFeatures[1] = "YORF"; 01301 m_vecstrFeatures[2] = "NAME"; 01302 m_vecstrFeatures[3] = "GWEIGHT"; 01303 m_vecvecstrFeatures.resize(m_vecstrFeatures.size() - 1); 01304 for (i = 0; i < m_vecvecstrFeatures.size(); ++i) 01305 m_vecvecstrFeatures[i].resize(veciGenes.size()); 01306 for (i = 0; i < veciGenes.size(); ++i) { 01307 m_vecvecstrFeatures[0][i] = m_vecvecstrFeatures[1][i] 01308 = vecstrGenes[veciGenes[i]]; 01309 m_vecvecstrFeatures[2][i] = "1"; 01310 } 01311 01312 m_vecstrGenes.resize(vecstrGenes.size()); 01313 for (i = 0; i < m_vecstrGenes.size(); ++i) { 01314 m_vecstrGenes[i] = "GENE"; 01315 sprintf_s(ac, "%d", veciGenes[i]); 01316 m_vecstrGenes[i] += ac; 01317 m_mapstriGenes[m_vecstrGenes[i]] = i; 01318 } 01319 m_vecstrExperiments.resize(vecstrExperiments.size()); 01320 for (i = 0; i < m_vecstrExperiments.size(); ++i) 01321 m_vecstrExperiments[i] = vecstrExperiments[i]; 01322 01323 m_Data.Initialize(GetGenes(), GetExperiments()); 01324 for (i = 0; i < m_Data.GetRows(); ++i) 01325 for (j = 0; j < m_Data.GetColumns(); ++j) 01326 m_Data.Set(i, j, CMeta::GetNaN()); 01327 } 01328 01342 bool CPCL::SortGenes(const vector<size_t>& veciOrder) { 01343 size_t i; 01344 01345 if (veciOrder.size() != m_Data.GetRows()) 01346 return false; 01347 01348 CMeta::Permute(m_Data.Get(), veciOrder); 01349 CMeta::Permute(m_vecstrGenes, veciOrder); 01350 for (i = 0; i < GetGenes(); ++i) 01351 m_mapstriGenes[GetGene(i)] = i; 01352 for (i = 0; i < m_vecvecstrFeatures.size(); ++i) 01353 CMeta::Permute(m_vecvecstrFeatures[i], veciOrder); 01354 01355 return true; 01356 } 01357 01369 void CPCL::RankTransform() { 01370 size_t i, j, k; 01371 vector<size_t> veciRanks, veciCounts; 01372 01373 veciRanks.resize(GetExperiments()); 01374 veciCounts.resize(GetExperiments()); 01375 for (i = 0; i < GetGenes(); ++i) { 01376 fill(veciRanks.begin(), veciRanks.end(), 0); 01377 for (j = 0; j < GetExperiments(); ++j) { 01378 if (CMeta::IsNaN(Get(i, j))) 01379 continue; 01380 for (k = 0; k < GetExperiments(); ++k) { 01381 if (CMeta::IsNaN(Get(i, k))) 01382 continue; 01383 if ((j != k) && (Get(i, k) < Get(i, j))) 01384 veciRanks[j]++; 01385 } 01386 } 01387 01388 fill(veciCounts.begin(), veciCounts.end(), 0); 01389 for (j = 0; j < GetExperiments(); ++j) 01390 if (!CMeta::IsNaN(Get(i, j))) 01391 veciCounts[veciRanks[j]]++; 01392 01393 for (j = 0; j < GetExperiments(); ++j) 01394 if (!CMeta::IsNaN(Get(i, j))) { 01395 k = veciRanks[j]; 01396 // Closed form for sum(rank, rank + n) / n 01397 Set(i, j, k + ((veciCounts[k] + 1) / 2.0f)); 01398 } 01399 } 01400 } 01401 01415 bool CPCL::AddGenes(const std::vector<std::string>& vecstrGenes) { 01416 size_t i, j, iStart; 01417 01418 iStart = m_Data.GetRows(); 01419 if (!m_Data.AddRows(vecstrGenes.size())) 01420 return false; 01421 for (i = iStart; i < m_Data.GetRows(); ++i) 01422 for (j = 0; j < m_Data.GetColumns(); ++j) 01423 m_Data.Set(i, j, CMeta::GetNaN()); 01424 01425 m_vecstrGenes.resize(m_vecstrGenes.size() + vecstrGenes.size()); 01426 for (i = 0; i < vecstrGenes.size(); ++i) 01427 SetGene(iStart + i, vecstrGenes[i]); 01428 for (i = 0; i < m_vecvecstrFeatures.size(); ++i) { 01429 m_vecvecstrFeatures[i].resize(m_vecvecstrFeatures[i].size() 01430 + vecstrGenes.size()); 01431 if (m_vecstrFeatures[i + 1] == c_szNAME) 01432 for (j = 0; j < vecstrGenes.size(); ++j) 01433 m_vecvecstrFeatures[i][iStart + j] = vecstrGenes[j]; 01434 else if (m_vecstrFeatures[i + 1] == c_szGWEIGHT) 01435 for (j = 0; j < vecstrGenes.size(); ++j) 01436 m_vecvecstrFeatures[i][iStart + j] = c_szOne; 01437 } 01438 01439 return true; 01440 } 01441 01452 void CPCL::Normalize(ENormalize eNormalize) { 01453 size_t i, j, iCount; 01454 double dAve, dStd; 01455 float d, dMin, dMax; 01456 01457 switch (eNormalize) { 01458 case ENormalizeMean: 01459 dMin = FLT_MAX; 01460 dAve = 0; 01461 for (iCount = i = 0; i < GetGenes(); ++i) 01462 for (j = 0; j < GetExperiments(); ++j) 01463 if (!CMeta::IsNaN(d = Get(i, j))) { 01464 if (d < dMin) 01465 dMin = d; 01466 iCount++; 01467 dAve += d; 01468 } 01469 if (!iCount) 01470 break; 01471 dAve = (dAve / iCount) - dMin; 01472 for (i = 0; i < GetGenes(); ++i) 01473 for (j = 0; j < GetExperiments(); ++j) 01474 if (!CMeta::IsNaN(d = Get(i, j))) 01475 Set(i, j, (float) ((d - dMin) / dAve)); 01476 break; 01477 01478 case ENormalizeZScore: 01479 dAve = dStd = 0; 01480 for (iCount = i = 0; i < GetGenes(); ++i) 01481 for (j = 0; j < GetExperiments(); ++j) 01482 if (!CMeta::IsNaN(d = Get(i, j))) { 01483 iCount++; 01484 dAve += d; 01485 dStd += d * d; 01486 } 01487 dAve /= iCount; 01488 dStd = sqrt((dStd / iCount) - (dAve * dAve)); 01489 for (i = 0; i < GetGenes(); ++i) 01490 for (j = 0; j < GetExperiments(); ++j) 01491 if (!CMeta::IsNaN(d = Get(i, j))) 01492 Set(i, j, (float) ((d - dAve) / dStd)); 01493 break; 01494 01495 case ENormalizeRow: 01496 for (i = 0; i < GetGenes(); ++i) { 01497 dAve = CStatistics::Average(Get(i), Get(i) + GetExperiments()); 01498 dStd = sqrt(CStatistics::Variance(Get(i), 01499 Get(i) + GetExperiments(), dAve)); 01500 if (dStd) 01501 for (j = 0; j < GetExperiments(); ++j) 01502 Set(i, j, (float) ((Get(i, j) - dAve) / dStd)); 01503 } 01504 break; 01505 01506 case ENormalizeColumn: 01507 case ENormalizeColumnCenter: 01508 case ENormalizeColumnFraction: 01509 for (i = 0; i < GetExperiments(); ++i) { 01510 dAve = dStd = 0; 01511 for (iCount = j = 0; j < GetGenes(); ++j) 01512 if (!CMeta::IsNaN(d = Get(j, i))) { 01513 iCount++; 01514 dAve += d; 01515 dStd += d * d; 01516 } 01517 if (iCount) { 01518 if (eNormalize != ENormalizeColumnFraction) { 01519 dAve /= iCount; 01520 dStd = (dStd / iCount) - (dAve * dAve); 01521 dStd = (dStd <= 0) ? 1 : sqrt(dStd); 01522 if (eNormalize == ENormalizeColumnCenter) 01523 dStd = 1; 01524 } 01525 for (j = 0; j < GetGenes(); ++j) 01526 if (!CMeta::IsNaN(d = Get(j, i))) { 01527 if (eNormalize == ENormalizeColumnFraction) 01528 d /= (float) dAve; 01529 else 01530 d = (float) ((d - dAve) / dStd); 01531 Set(j, i, d); 01532 } 01533 } 01534 } 01535 break; 01536 01537 case ENormalizeMinMax: 01538 dMin = FLT_MAX; 01539 dMax = -dMin; 01540 for (i = 0; i < GetGenes(); ++i) 01541 for (j = 0; j < GetExperiments(); ++j) 01542 if (!CMeta::IsNaN(d = Get(i, j))) { 01543 if (d < dMin) 01544 dMin = d; 01545 if (d > dMax) 01546 dMax = d; 01547 } 01548 if (!(dMax -= dMin)) 01549 dMax = 1; 01550 for (i = 0; i < GetGenes(); ++i) 01551 for (j = 0; j < GetExperiments(); ++j) 01552 if (!CMeta::IsNaN(d = Get(i, j))) 01553 Set(i, j, (d - dMin) / dMax); 01554 break; 01555 case EMeanSubtractColumn: 01556 for (i = 0; i < GetExperiments(); ++i) { 01557 dAve = 0; 01558 for (iCount = j = 0; j < GetGenes(); ++j) 01559 if (!CMeta::IsNaN(d = Get(j, i))) { 01560 iCount++; 01561 dAve += d; 01562 01563 } 01564 01565 if (iCount) { 01566 dAve /= iCount; 01567 for (j = 0; j < GetGenes(); ++j) 01568 if (!CMeta::IsNaN(d = Get(j, i))) 01569 Set(j, i, (float) (d - dAve)); 01570 } 01571 } 01572 break; 01573 } 01574 } 01575 01600 void CPCL::Impute(size_t iNeighbors, float dMinimumPresent, 01601 const CDat& DatSimilarity) { 01602 vector<float> vecdMissing; 01603 vector<size_t> veciMissing; 01604 vector<SNeighbors> vecsNeighbors; 01605 size_t i, j, k, iCol, iMissing, iOne, iTwo; 01606 float d, dSum; 01607 const float* ad; 01608 01609 vecdMissing.resize(GetGenes()); 01610 for (i = 0; i < vecdMissing.size(); ++i) { 01611 for (iMissing = j = 0; j < GetExperiments(); ++j) 01612 if (CMeta::IsNaN(Get(i, j))) 01613 iMissing++; 01614 if ((vecdMissing[i] = (float) (GetExperiments() - iMissing) 01615 / GetExperiments()) >= dMinimumPresent) 01616 veciMissing.push_back(i); 01617 } 01618 if (iNeighbors) { 01619 vecsNeighbors.resize(veciMissing.size()); 01620 for (i = 0; i < vecsNeighbors.size(); ++i) 01621 vecsNeighbors[i].Initialize(Get(veciMissing[i]), GetExperiments(), 01622 iNeighbors); 01623 for (i = 0; i < veciMissing.size(); ++i) { 01624 if (!(i % 100)) 01625 g_CatSleipnir().info( 01626 "CPCL::Impute( %d, %g ) finding neighbors for gene %d/%d", 01627 iNeighbors, dMinimumPresent, i, veciMissing.size()); 01628 ad = Get(iOne = veciMissing[i]); 01629 for (j = (i + 1); j < veciMissing.size(); ++j) { 01630 d = DatSimilarity.Get(iOne, iTwo = veciMissing[j]); 01631 vecsNeighbors[i].Add(iTwo, d, Get(iTwo)); 01632 vecsNeighbors[j].Add(iOne, d, ad); 01633 } 01634 } 01635 } 01636 01637 for (iOne = i = 0; i < GetGenes(); ++i) { 01638 if (vecdMissing[i] < dMinimumPresent) { 01639 MaskGene(i); 01640 continue; 01641 } 01642 if (vecsNeighbors.empty()) 01643 continue; 01644 { 01645 const SNeighbors& sGene = vecsNeighbors[iOne++]; 01646 01647 for (j = 0; j < GetExperiments(); ++j) { 01648 if (!CMeta::IsNaN(Get(i, j))) 01649 continue; 01650 01651 iCol = sGene.GetColumn(j); 01652 for (d = dSum = 0, iMissing = k = 0; k < iNeighbors; ++k) { 01653 const pair<size_t, float>& prNeighbor = 01654 sGene.m_MatNeighbors.Get(k, iCol); 01655 01656 if (prNeighbor.second == -FLT_MAX) 01657 continue; 01658 iMissing++; 01659 dSum += prNeighbor.second; 01660 d += Get(prNeighbor.first, j) * prNeighbor.second; 01661 } 01662 if (dSum) 01663 d /= dSum; 01664 Set(i, j, iMissing ? d : CMeta::GetNaN()); 01665 } 01666 } 01667 } 01668 } 01669 01701 void CPCL::Impute(size_t iNeighbors, float dMinimumPresent, 01702 const IMeasure* pMeasure, bool fPrecompute) { 01703 CDat Dat; 01704 size_t i, j; 01705 01706 if (fPrecompute) { 01707 Dat.Open(GetGeneNames()); 01708 for (i = 0; i < Dat.GetGenes(); ++i) 01709 for (j = (i + 1); j < Dat.GetGenes(); ++j) 01710 Dat.Set(i, j, (float) pMeasure->Measure(Get(i), 01711 GetExperiments(), Get(j), GetExperiments())); 01712 } else 01713 Dat.Open(*this, pMeasure, false); 01714 Impute(iNeighbors, dMinimumPresent, Dat); 01715 } 01716 01733 void CPCL::MedianMultiples(size_t iSample, size_t iBins, float dBinSize) { 01734 size_t i, j, k, iBin, iOne, iCutoff; 01735 CMeasureEuclidean Euclidean; 01736 vector<float> vecdEuclidean, vecdMapped, vecdBG, vecdFG, vecdMean; 01737 float d, dAve, dStd, dSample; 01738 vector<vector<size_t> > vecveciGenes; 01739 vector<size_t> veciMean; 01740 vector<string> vecstrAdd; 01741 01742 { 01743 map<string, size_t> mapstriClasses; 01744 map<string, size_t>::iterator iterClass; 01745 01746 for (i = 0; i < m_vecstrGenes.size(); ++i) { 01747 const string& strGene = m_vecstrGenes[i]; 01748 01749 if ((iterClass = mapstriClasses.find(strGene)) 01750 == mapstriClasses.end()) { 01751 mapstriClasses[strGene] = vecveciGenes.size(); 01752 vecveciGenes.push_back(vector<size_t> ()); 01753 vecveciGenes.back().push_back(i); 01754 } else 01755 vecveciGenes[iterClass->second].push_back(i); 01756 } 01757 } 01758 01759 dSample = 2.0f * iSample / GetGenes() / (GetGenes() - 1); 01760 dAve = dStd = 0; 01761 for (k = i = 0; i < GetGenes(); ++i) 01762 for (j = (i + 1); j < GetGenes(); ++j) { 01763 if (((float) rand() / RAND_MAX) >= dSample) 01764 continue; 01765 vecdEuclidean.push_back(d = (float) Euclidean.Measure(Get(i), 01766 GetExperiments(), Get(j), GetExperiments(), 01767 IMeasure::EMapNone)); 01768 k++; 01769 dAve += d; 01770 dStd += d * d; 01771 } 01772 dAve /= k; 01773 dStd = sqrt((dStd / (k - 1)) - (dAve * dAve)); 01774 if (!(iBins % 2)) 01775 iBins++; 01776 01777 vecdBG.resize(iBins); 01778 for (i = 0; i < vecdEuclidean.size(); ++i) 01779 vecdBG[MedianMultiplesBin(vecdEuclidean[i], dAve, dStd, iBins, dBinSize)]++; 01780 for (i = 0; i < vecdBG.size(); ++i) 01781 // vecdBG[i] /= vecdEuclidean.size( ); 01782 vecdBG[i] = (vecdBG[i] + 1) / (vecdEuclidean.size() + vecdBG.size()); 01783 //* 01784 MedianMultiplesSmooth(2, vecdBG); 01785 //*/ 01786 01787 MedianMultiplesMapped(vecveciGenes, vecdMapped); 01788 vecdFG.resize(iBins); 01789 for (i = 0; i < vecdMapped.size(); ++i) 01790 vecdFG[MedianMultiplesBin(vecdMapped[i], dAve, dStd, iBins, dBinSize)]++; 01791 /* 01792 dMax = 0; 01793 for( iMax = i = 0; i < vecdBG.size( ); ++i ) 01794 if( vecdBG[i] > dMax ) { 01795 dMax = vecdBG[i]; 01796 iMax = i; } 01797 for( i = 0; i <= iMax; ++i ) 01798 vecdFG[i]++; 01799 for( i = 0; i < vecdFG.size( ); ++i ) 01800 vecdFG[i] /= vecdMapped.size( ) + iMax + 1; 01801 //*/ 01802 //*/ 01803 for (i = 0; i < vecdFG.size(); ++i) 01804 // vecdFG[i] /= vecdMapped.size( ); 01805 vecdFG[i] = (vecdFG[i] + 1) / (vecdMapped.size() + vecdFG.size()); 01806 MedianMultiplesSmooth(2, vecdFG); 01807 /* 01808 for( i = 0; i < vecdBG.size( ); ++i ) 01809 cerr << vecdBG[i] << '\t'; 01810 cerr << endl; 01811 for( i = 0; i < vecdFG.size( ); ++i ) 01812 cerr << vecdFG[i] << '\t'; 01813 cerr << endl; 01814 //*/ 01815 for (iCutoff = 0; iCutoff < vecdFG.size(); ++iCutoff) 01816 if (vecdFG[iCutoff] < vecdBG[iCutoff]) 01817 break; 01818 01819 vecstrAdd.resize(1); 01820 veciMean.resize(GetExperiments()); 01821 vecdMean.resize(GetExperiments()); 01822 for (i = 0; i < vecveciGenes.size(); ++i) { 01823 const vector<size_t>& veciGenes = vecveciGenes[i]; 01824 vector<size_t> veciAgree; 01825 01826 if (veciGenes.size() < 2) 01827 continue; 01828 veciAgree.resize(veciGenes.size()); 01829 for (j = 0; j < veciGenes.size(); ++j) { 01830 iOne = veciGenes[j]; 01831 for (k = (j + 1); k < veciGenes.size(); ++k) { 01832 iBin = MedianMultiplesBin((float) Euclidean.Measure(Get(iOne), 01833 GetExperiments(), Get(veciGenes[k]), GetExperiments(), 01834 IMeasure::EMapNone), dAve, dStd, iBins, dBinSize); 01835 // if( iBin <= iCutoff ) { 01836 if ((1.01 * vecdFG[iBin]) >= vecdBG[iBin]) { 01837 veciAgree[j]++; 01838 veciAgree[k]++; 01839 } 01840 } 01841 } 01842 01843 fill(vecdMean.begin(), vecdMean.end(), 0.0f); 01844 fill(veciMean.begin(), veciMean.end(), 0); 01845 for (iBin = j = 0; j < veciGenes.size(); ++j) 01846 if ((2 * veciAgree[j]) >= (veciAgree.size() - 1)) { 01847 iBin++; 01848 iOne = veciGenes[j]; 01849 for (k = 0; k < GetExperiments(); ++k) 01850 if (!CMeta::IsNaN(d = Get(iOne, k))) { 01851 veciMean[k]++; 01852 vecdMean[k] += d; 01853 } 01854 } 01855 01856 for (j = 0; j < veciGenes.size(); ++j) 01857 MaskGene(veciGenes[j]); 01858 if ((iBin * 2) > veciAgree.size()) { 01859 for (j = 0; j < vecdMean.size(); ++j) 01860 vecdMean[j] = veciMean[j] ? (vecdMean[j] / veciMean[j]) 01861 : CMeta::GetNaN(); 01862 vecstrAdd[0] = GetGene(veciGenes[0]); 01863 AddGenes(vecstrAdd); 01864 Set(GetGenes() - 1, &vecdMean[0]); 01865 } 01866 } 01867 } 01868 01869 void CPCLImpl::MedianMultiplesMapped( 01870 const vector<vector<size_t> >& vecveciGenes, vector<float>& vecdMapped) { 01871 size_t i, j, k; 01872 CMeasureEuclidean Euclidean; 01873 const float* adOne; 01874 01875 for (i = 0; i < vecveciGenes.size(); ++i) { 01876 const vector<size_t>& veciGenes = vecveciGenes[i]; 01877 01878 if (veciGenes.size() < 2) 01879 continue; 01880 for (j = 0; j < veciGenes.size(); ++j) { 01881 adOne = m_Data.Get(veciGenes[j]); 01882 for (k = (j + 1); k < veciGenes.size(); ++k) 01883 vecdMapped.push_back((float) Euclidean.Measure(adOne, 01884 m_Data.GetColumns(), m_Data.Get(veciGenes[k]), 01885 m_Data.GetColumns(), IMeasure::EMapNone)); 01886 } 01887 } 01888 } 01889 01895 bool CPCL::populate(const char* szFile, float dDefault){ 01896 ifstream istm; 01897 const char* pc; 01898 char* pcTail; 01899 char* acBuf; 01900 string strToken, strCache, strValue; 01901 size_t iOne, iTwo, i; 01902 float dScore; 01903 01904 istm.open( szFile ); 01905 01906 acBuf = new char[ c_iBufferSize ]; 01907 while( istm.peek( ) != EOF ) { 01908 istm.getline( acBuf, c_iBufferSize - 1 ); 01909 strToken = OpenToken( acBuf, &pc ); 01910 if( !strToken.length( ) ) 01911 break; 01912 //if( strToken == c_acComment ) 01913 // continue; 01914 if( strToken != strCache ) { 01915 strCache = strToken; 01916 iOne = GetGene( strToken ); 01917 } 01918 01919 strToken = OpenToken( pc, &pc ); 01920 if( !strToken.length( ) ) { 01921 delete[] acBuf; 01922 return false; } 01923 iTwo = GetGene( strToken ); 01924 01925 strValue = OpenToken( pc ); 01926 if( !strValue.length( ) ) { 01927 if( CMeta::IsNaN( dScore = dDefault ) ) { 01928 delete[] acBuf; 01929 return false; } } 01930 else if( !( dScore = (float)strtod( strValue.c_str( ), &pcTail ) ) && 01931 ( pcTail != ( strValue.c_str( ) + strValue.length( ) ) ) ) { 01932 delete[] acBuf; 01933 return false; } 01934 01935 Set(iOne, iTwo, dScore); 01936 01937 } 01938 delete[] acBuf; 01939 01940 return true; 01941 } 01942 01943 }