Sleipnir
src/pcl.cpp
00001 /*****************************************************************************
00002  * This file is provided under the Creative Commons Attribution 3.0 license.
00003  *
00004  * You are free to share, copy, distribute, transmit, or adapt this work
00005  * PROVIDED THAT you attribute the work to the authors listed below.
00006  * For more information, please see the following web page:
00007  * http://creativecommons.org/licenses/by/3.0/
00008  *
00009  * This file is a component of the Sleipnir library for functional genomics,
00010  * authored by:
00011  * Curtis Huttenhower (chuttenh@princeton.edu)
00012  * Mark Schroeder
00013  * Maria D. Chikina
00014  * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015  *
00016  * If you use this library, the included executable tools, or any related
00017  * code in your work, please cite the following publication:
00018  * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019  * Olga G. Troyanskaya.
00020  * "The Sleipnir library for computational functional genomics"
00021  *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "pcl.h"
00024 #include "meta.h"
00025 #include "statistics.h"
00026 #include "genome.h"
00027 #include "measure.h"
00028 #include "dat.h"
00029 
00030 namespace Sleipnir {
00031 
00032 const char CPCLImpl::c_szEWEIGHT[] = "EWEIGHT";
00033 const char CPCLImpl::c_szGENE[] = "GENE";
00034 const char CPCLImpl::c_szGID[] = "GID";
00035 const char CPCLImpl::c_szGWEIGHT[] = "GWEIGHT";
00036 const char CPCLImpl::c_szNAME[] = "NAME";
00037 const char CPCLImpl::c_szOne[] = "1";
00038 const char CPCLImpl::c_szExtension[] = ".pcl";
00039 const char CPCLImpl::c_szBinExtension[] = ".bin";
00040 const char CPCLImpl::c_szDabExtension[] = ".dab";
00041 
00042 struct SNeighbors {
00043     CFullMatrix<pair<size_t, float> > m_MatNeighbors;
00044     vector<size_t> m_veciMin;
00045     vector<size_t> m_veciColumns;
00046 
00047     void Initialize(const float* adValues, size_t iValues, size_t iNeighbors) {
00048         size_t i, j;
00049 
00050         m_veciColumns.clear();
00051         for (i = 0; i < iValues; ++i)
00052             if (CMeta::IsNaN(adValues[i]))
00053                 m_veciColumns.push_back(i);
00054 
00055         m_veciMin.resize(m_veciColumns.size());
00056         m_MatNeighbors.Initialize(iNeighbors, m_veciColumns.size());
00057         for (i = 0; i < m_MatNeighbors.GetRows(); ++i)
00058             for (j = 0; j < m_MatNeighbors.GetColumns(); ++j)
00059                 m_MatNeighbors.Get(i, j).second = -FLT_MAX;
00060     }
00061 
00062     bool Add(size_t iNeighbor, float dSim, const float* adValues) {
00063         size_t i, j, iCol;
00064         bool fRet;
00065 
00066         if (!m_MatNeighbors.GetRows())
00067             return false;
00068         for (fRet = false, i = 0; i < m_veciColumns.size(); ++i) {
00069             iCol = m_veciColumns[i];
00070             if (!CMeta::IsNaN(adValues[iCol]) && (dSim > m_MatNeighbors.Get(
00071                     m_veciMin[i], i).second)) {
00072                 fRet = true;
00073                 m_MatNeighbors.Get(m_veciMin[i], i).first = iNeighbor;
00074                 m_MatNeighbors.Get(m_veciMin[i], i).second = dSim;
00075 
00076                 for (m_veciMin[i] = 0, j = 1; j < m_MatNeighbors.GetRows(); ++j)
00077                     if (m_MatNeighbors.Get(j, i).second < m_MatNeighbors.Get(
00078                             m_veciMin[i], i).second)
00079                         m_veciMin[i] = j;
00080             }
00081         }
00082 
00083         return fRet;
00084     }
00085 
00086     size_t GetColumn(size_t iColumn) const {
00087         size_t i;
00088 
00089         for (i = 0; i < m_veciColumns.size(); ++i)
00090             if (m_veciColumns[i] == iColumn)
00091                 return i;
00092 
00093         return -1;
00094     }
00095 };
00096 
00165 int CPCL::Distance(const char* szFile, size_t iSkip,
00166         const char* szSimilarityMeasure, bool fNormalize, bool fZScore,
00167         bool fAutocorrelate, const char* szGeneFile, float dCutoff,
00168         size_t iLimit, CPCL& PCL, CDat& Dat, IMeasure::EMap eMap,
00169         bool fFrequencyWeight, float dAlpha, int nThreads) {
00170 
00171 
00172     size_t i, j, iOne, iTwo;
00173     float d;
00174     ifstream ifsm;
00175     vector<string> vecstrGenes;
00176     CGenome Genome;
00177     CGenes GenesIn(Genome);
00178     vector<size_t> veciGenes;
00179     const float* adOne;
00180     const float* adWeights;
00181     vector<float> vecdWeights;
00182     IMeasure* pMeasure;
00183     CMeasurePearson Pearson;
00184     CMeasureEuclidean Euclidean;
00185     CMeasureKendallsTau KendallsTau;
00186     CMeasureKolmogorovSmirnov KolmSmir;
00187     CMeasureSpearman Spearman(true);
00188     CMeasurePearNorm PearNorm;
00189     CMeasureHypergeometric Hypergeom;
00190     CMeasureQuickPearson PearQuick;
00191     CMeasureInnerProduct InnerProd;
00192     CMeasureBinaryInnerProduct BinInnerProd;
00193     CMeasureMutualInformation MutualInfo;
00194     CMeasureRelativeAUC RelAuc;
00195     CMeasurePearsonSignificance PearSig;
00196     CMeasureDice Dice( dAlpha );
00197     CMeasureDistanceCorrelation DCor;
00198     CMeasureSignedDistanceCorrelation SDCor;
00199     if (szFile) {
00200         g_CatSleipnir().debug("Opening PCL for distance");
00201         if (!PCL.Open(szFile, iSkip, false, false)) {
00202             g_CatSleipnir().error(
00203                     "CPCL::Distance( %s, %d, %s, %d, %d, %d, %s, %g ) failed to open PCL",
00204                     szFile, iSkip, szSimilarityMeasure, fNormalize, fZScore,
00205                     fAutocorrelate, szGeneFile ? szGeneFile : "", dCutoff);
00206             return 1;
00207         }
00208         ifsm.close();
00209     } else if (!PCL.Open(cin, iSkip)) {
00210         g_CatSleipnir().error(
00211                 "CPCL::Distance( %s, %d, %s, %d, %d, %d, %s, %g ) failed to open PCL",
00212                 "stdin", iSkip, szSimilarityMeasure, fNormalize, fZScore,
00213                 fAutocorrelate, szGeneFile ? szGeneFile : "", dCutoff);
00214         return 1;
00215     }
00216     if (!szSimilarityMeasure)
00217         return 0;
00218 
00219     CMeasureSigmoid
00220             EuclideanSig(&Euclidean, false, 1.0f / PCL.GetExperiments());
00221     IMeasure* apMeasures[] = { &Pearson, &EuclideanSig, &KendallsTau,
00222             &KolmSmir, &Spearman, &PearNorm, &Hypergeom, &PearQuick,
00223             &InnerProd, &BinInnerProd, &MutualInfo, &RelAuc, &PearSig, &Dice,&DCor,&SDCor, NULL };
00224 
00225     pMeasure = NULL;
00226     for (i = 0; apMeasures[i]; ++i)
00227         if (!strcmp(apMeasures[i]->GetName(), szSimilarityMeasure)) {
00228             pMeasure = apMeasures[i];
00229             break;
00230         }
00231     if (!pMeasure)
00232         return 1;
00233     g_CatSleipnir().info("Method: %s ", pMeasure->GetName(), i);
00234     if (fFrequencyWeight) {
00235         vecdWeights.resize(PCL.GetExperiments());
00236         for (i = 0; i < vecdWeights.size(); ++i) {
00237             for (iOne = j = 0; j < PCL.GetGenes(); ++j)
00238                 if (!CMeta::IsNaN(d = PCL.Get(j, i)) && (d > 0))
00239                     iOne++;
00240             vecdWeights[i] = (float) ((PCL.GetGenes() + 1) - iOne)
00241                     / PCL.GetGenes();
00242         }
00243     }
00244     adWeights = vecdWeights.empty() ? NULL : &vecdWeights[0];
00245 
00246     CMeasureAutocorrelate Autocorrelate(pMeasure, false);
00247     if (fAutocorrelate)
00248         pMeasure = &Autocorrelate;
00249 
00250     if (szGeneFile) {
00251         ifsm.clear();
00252         ifsm.open(szGeneFile);
00253         if (!GenesIn.Open(ifsm)) {
00254             g_CatSleipnir().error(
00255                     "CPCL::Distance( %s, %d, %s, %d, %d, %d, %s, %g ) failed to open genes",
00256                     szFile ? szFile : "stdin", iSkip, szSimilarityMeasure,
00257                     fNormalize, fZScore, fAutocorrelate, szGeneFile, dCutoff);
00258             return 1;
00259         }
00260         ifsm.close();
00261     } else
00262         GenesIn.Open(PCL.GetGeneNames());
00263     veciGenes.resize(GenesIn.GetGenes());
00264     for (i = 0; i < veciGenes.size(); ++i)
00265         veciGenes[i] = szGeneFile ? PCL.GetGene(GenesIn.GetGene(i).GetName())
00266                 : i;
00267 
00268     if (pMeasure->IsRank())
00269         PCL.RankTransform();
00270 
00271     g_CatSleipnir().info("Number of Experiments: %d", PCL.GetExperiments());
00272 
00273     if ((iLimit != -1) && (PCL.GetGenes() > iLimit))
00274         Dat.Open(PCL, pMeasure->Clone(), true);
00275     else {
00276         Dat.Open(GenesIn.GetGeneNames());
00277         for (i = 0; i < Dat.GetGenes(); ++i)
00278             for (j = (i + 1); j < Dat.GetGenes(); ++j)
00279                 Dat.Set(i, j, CMeta::GetNaN());
00280         int origNThreads = omp_get_num_threads();
00281         omp_set_num_threads(nThreads);
00282         for (i = 0; i < GenesIn.GetGenes(); ++i) {
00283             if (!(i % 100))
00284                 g_CatSleipnir().info(
00285                         "CPCL::Distance( %s, %d, %s, %d, %d, %d, %s, %g ) processing gene %d/%d",
00286                         szFile ? szFile : "stdin", iSkip, szSimilarityMeasure,
00287                         fNormalize, fZScore, fAutocorrelate,
00288                         szGeneFile ? szGeneFile : "", dCutoff, i,
00289                         GenesIn.GetGenes());
00290             if ((iOne = veciGenes[i]) == -1)
00291                 continue;
00292             adOne = PCL.Get(iOne);
00293             for (j = (i + 1); j < GenesIn.GetGenes(); ++j){
00294                 if ((iTwo = veciGenes[j]) != -1){
00295                     Dat.Set(i, j, (float) pMeasure->Measure(adOne,
00296                             PCL.GetExperiments(), PCL.Get(iTwo),
00297                             PCL.GetExperiments(), eMap, adWeights, adWeights));
00298                 }
00299             }
00300         }
00301         omp_set_num_threads(origNThreads);
00302         if (fNormalize || fZScore)
00303             Dat.Normalize(fZScore ? CDat::ENormalizeZScore
00304                     : CDat::ENormalizeMinMax);
00305         if (!CMeta::IsNaN(dCutoff))
00306             for (i = 0; i < Dat.GetGenes(); ++i)
00307                 for (j = (i + 1); j < Dat.GetGenes(); ++j)
00308                     if (!CMeta::IsNaN(d = Dat.Get(i, j)) && (d < dCutoff))
00309                         Dat.Set(i, j, CMeta::GetNaN());
00310 
00311         if(pMeasure->GetName()=="pearson" || pMeasure->GetName()=="pearnorm"){
00312             vector<unsigned long> bins;
00313             bins.resize(55);
00314             float upper = 0;
00315             float lower = 0;
00316             if(pMeasure->GetName()=="pearson"){
00317                 upper = 1.0;
00318                 lower = -1.0;
00319             }else if(pMeasure->GetName()=="pearnorm"){
00320                 upper = 5.0;
00321                 lower = -5.0;
00322             }
00323             float bin_size = (upper - lower) / 50;
00324             for(i=0; i<55; i++)
00325                 bins[i] = 0;
00326             for(i=0; i<Dat.GetGenes(); i++){
00327                 for(j=i+1; j<Dat.GetGenes(); j++){
00328                     d = Dat.Get(i,j);
00329                     if(CMeta::IsNaN(d)) continue;
00330                     int b = (int) ((d - lower) / bin_size);
00331                     if(b<0){
00332                         bins[0]++;
00333                         continue;
00334                     }
00335                     if(b>=55){
00336                         bins[54]++;
00337                         continue;
00338                     }
00339                     bins[b]++;
00340                 }
00341             }
00342             g_CatSleipnir().info(
00343             "Distribution of distances: bin size: %.5f, number of bins: %d, min bin value: %.5f, max bin value: %.5f",
00344             bin_size, 55, lower, upper);
00345             for(i=0; i<55; i++){
00346                 g_CatSleipnir().info("%lu\t%lu", i, bins[i]);
00347             }
00348         }
00349 
00350     }
00351 
00352     return 0;
00353 }
00354 
00426 int CPCL::Distance(const char* szFile, size_t iSkip, const char* szWeights,
00427         const char* szSimilarityMeasure, bool fNormalize, bool fZScore,
00428         bool fAutocorrelate, const char* szGeneFile, float dCutoff,
00429         size_t iLimit, CPCL& PCL, CDat& Dat, IMeasure::EMap eMap,
00430         bool fFrequencyWeight, float dAlpha, int nThreads) {
00431     size_t i, j, iOne, iTwo;
00432     float d;
00433     ifstream ifsm;
00434     vector<string> vecstrGenes;
00435     CGenome Genome;
00436     CGenes GenesIn(Genome);
00437     vector<size_t> veciGenes;
00438     const float* adOne;
00439     const float* adWeights;
00440     vector<float> vecdWeights;
00441     vector<string> vecstrWeights;
00442     string acStr;
00443     IMeasure* pMeasure;
00444     CMeasurePearson Pearson;
00445     CMeasureEuclidean Euclidean;
00446     CMeasureKendallsTau KendallsTau;
00447     CMeasureKolmogorovSmirnov KolmSmir;
00448     CMeasureSpearman Spearman(true);
00449     CMeasurePearNorm PearNorm;
00450     CMeasureHypergeometric Hypergeom;
00451     CMeasureQuickPearson PearQuick;
00452     CMeasureInnerProduct InnerProd;
00453     CMeasureBinaryInnerProduct BinInnerProd;
00454     CMeasureMutualInformation MutualInfo;
00455     CMeasureRelativeAUC RelAuc;
00456     CMeasurePearsonSignificance PearSig;
00457     CMeasureDice Dice( dAlpha );
00458     CMeasureDistanceCorrelation DCor;
00459     CMeasureSignedDistanceCorrelation SDCor;
00460     if (szFile) {
00461         g_CatSleipnir().debug("Opening PCL for distance");
00462         if (!PCL.Open(szFile, iSkip, false, false)) {
00463             g_CatSleipnir().error(
00464                     "CPCL::Distance( %s, %d, %s, %d, %d, %d, %s, %g ) failed to open PCL",
00465                     szFile, iSkip, szSimilarityMeasure, fNormalize, fZScore,
00466                     fAutocorrelate, szGeneFile ? szGeneFile : "", dCutoff);
00467             return 1;
00468         }
00469         ifsm.close();
00470     } else if (!PCL.Open(cin, iSkip)) {
00471         g_CatSleipnir().error(
00472                 "CPCL::Distance( %s, %d, %s, %d, %d, %d, %s, %g ) failed to open PCL",
00473                 "stdin", iSkip, szSimilarityMeasure, fNormalize, fZScore,
00474                 fAutocorrelate, szGeneFile ? szGeneFile : "", dCutoff);
00475         return 1;
00476     }
00477     if (!szSimilarityMeasure)
00478         return 0;
00479 
00480     CMeasureSigmoid
00481             EuclideanSig(&Euclidean, false, 1.0f / PCL.GetExperiments());
00482     IMeasure* apMeasures[] = { &Pearson, &EuclideanSig, &KendallsTau,
00483             &KolmSmir, &Spearman, &PearNorm, &Hypergeom, &PearQuick,
00484             &InnerProd, &BinInnerProd, &MutualInfo, &RelAuc, &PearSig, &Dice,&DCor,&SDCor, NULL };
00485 
00486     pMeasure = NULL;
00487     for (i = 0; apMeasures[i]; ++i)
00488         if (!strcmp(apMeasures[i]->GetName(), szSimilarityMeasure)) {
00489             pMeasure = apMeasures[i];
00490             break;
00491         }
00492     if (!pMeasure)
00493         return 1;
00494     g_CatSleipnir().info("Method: %s ", pMeasure->GetName(), i);
00495     if (fFrequencyWeight) {
00496         vecdWeights.resize(PCL.GetExperiments());
00497         for (i = 0; i < vecdWeights.size(); ++i) {
00498             for (iOne = j = 0; j < PCL.GetGenes(); ++j)
00499                 if (!CMeta::IsNaN(d = PCL.Get(j, i)) && (d > 0))
00500                     iOne++;
00501             vecdWeights[i] = (float) ((PCL.GetGenes() + 1) - iOne)
00502                     / PCL.GetGenes();
00503         }
00504     }
00505     adWeights = vecdWeights.empty() ? NULL : &vecdWeights[0];
00506 
00507     if ( adWeights == NULL ) {
00508         if (szWeights != NULL) {
00509             ifsm.open(szWeights);
00510             getline(ifsm, acStr);
00511             ifsm.close();
00512             CMeta::Tokenize( acStr.c_str(), vecstrWeights, CMeta::c_szWS, true);
00513             if ( vecstrWeights.size() != PCL.GetExperiments()) {
00514                 g_CatSleipnir().error(
00515                         "CPCL::Distance( %s, %d, %s, %d, %d, %d, %s, %g ) length of weights file (%s, length: %d) did not match number of experiments (%d).",
00516                         szFile, iSkip, szSimilarityMeasure, fNormalize, fZScore,
00517                         fAutocorrelate, szGeneFile ? szGeneFile : "", dCutoff, szWeights,
00518                         vecstrWeights.size(), PCL.GetExperiments());
00519                 return 1;
00520             }
00521             vecdWeights.resize(PCL.GetExperiments());
00522             for (i = 0; i < vecstrWeights.size(); ++i) {
00523                 vecdWeights[i] = atof(vecstrWeights[i].c_str());
00524             }
00525             adWeights = vecdWeights.empty() ? NULL : &vecdWeights[0];
00526         }
00527     }
00528 
00529     CMeasureAutocorrelate Autocorrelate(pMeasure, false);
00530     if (fAutocorrelate)
00531         pMeasure = &Autocorrelate;
00532 
00533     if (szGeneFile) {
00534         ifsm.clear();
00535         ifsm.open(szGeneFile);
00536         if (!GenesIn.Open(ifsm)) {
00537             g_CatSleipnir().error(
00538                     "CPCL::Distance( %s, %d, %s, %d, %d, %d, %s, %g ) failed to open genes",
00539                     szFile ? szFile : "stdin", iSkip, szSimilarityMeasure,
00540                     fNormalize, fZScore, fAutocorrelate, szGeneFile, dCutoff);
00541             return 1;
00542         }
00543         ifsm.close();
00544     } else
00545         GenesIn.Open(PCL.GetGeneNames());
00546     veciGenes.resize(GenesIn.GetGenes());
00547     for (i = 0; i < veciGenes.size(); ++i)
00548         veciGenes[i] = szGeneFile ? PCL.GetGene(GenesIn.GetGene(i).GetName())
00549                 : i;
00550 
00551     if (pMeasure->IsRank())
00552         PCL.RankTransform();
00553 
00554     g_CatSleipnir().info("Number of Experiments: %d", PCL.GetExperiments());
00555 
00556     if ((iLimit != -1) && (PCL.GetGenes() > iLimit))
00557         Dat.Open(PCL, pMeasure->Clone(), true);
00558     else {
00559         Dat.Open(GenesIn.GetGeneNames());
00560         for (i = 0; i < Dat.GetGenes(); ++i)
00561             for (j = (i + 1); j < Dat.GetGenes(); ++j)
00562                 Dat.Set(i, j, CMeta::GetNaN());
00563         for (i = 0; i < GenesIn.GetGenes(); ++i) {
00564             if (!(i % 100))
00565                 g_CatSleipnir().info(
00566                         "CPCL::Distance( %s, %d, %s, %d, %d, %d, %s, %g ) processing gene %d/%d",
00567                         szFile ? szFile : "stdin", iSkip, szSimilarityMeasure,
00568                         fNormalize, fZScore, fAutocorrelate,
00569                         szGeneFile ? szGeneFile : "", dCutoff, i,
00570                         GenesIn.GetGenes());
00571             if ((iOne = veciGenes[i]) == -1)
00572                 continue;
00573             adOne = PCL.Get(iOne);
00574             #pragma omp parallel for num_threads(nThreads)
00575             for (j = (i + 1); j < GenesIn.GetGenes(); ++j){
00576                 if ((iTwo = veciGenes[j]) != -1){
00577                     Dat.Set(i, j, (float) pMeasure->Measure(adOne,
00578                             PCL.GetExperiments(), PCL.Get(iTwo),
00579                             PCL.GetExperiments(), eMap, adWeights, adWeights));
00580                 }
00581             }
00582         }
00583 
00584         if (fNormalize || fZScore)
00585             Dat.Normalize(fZScore ? CDat::ENormalizeZScore
00586                     : CDat::ENormalizeMinMax);
00587 
00588         if (!CMeta::IsNaN(dCutoff))
00589             for (i = 0; i < Dat.GetGenes(); ++i)
00590                 for (j = (i + 1); j < Dat.GetGenes(); ++j)
00591                     if (!CMeta::IsNaN(d = Dat.Get(i, j)) && (d < dCutoff))
00592                         Dat.Set(i, j, CMeta::GetNaN());
00593 
00594         if(pMeasure->GetName()=="pearson" || pMeasure->GetName()=="pearnorm"){
00595             vector<unsigned long> bins;
00596             bins.resize(55);
00597             float upper = 0;
00598             float lower = 0;
00599             if(pMeasure->GetName()=="pearson"){
00600                 upper = 1.0;
00601                 lower = -1.0;
00602             }else if(pMeasure->GetName()=="pearnorm"){
00603                 upper = 5.0;
00604                 lower = -5.0;
00605             }
00606             float bin_size = (upper - lower) / 50;
00607             for(i=0; i<55; i++)
00608                 bins[i] = 0;
00609             for(i=0; i<Dat.GetGenes(); i++){
00610                 for(j=i+1; j<Dat.GetGenes(); j++){
00611                     d = Dat.Get(i,j);
00612                     if(CMeta::IsNaN(d)) continue;
00613                     int b = (int) ((d - lower) / bin_size);
00614                     if(b<0){
00615                         bins[0]++;
00616                         continue;
00617                     }
00618                     if(b>=55){
00619                         bins[54]++;
00620                         continue;
00621                     }
00622                     bins[b]++;
00623                 }
00624             }
00625             g_CatSleipnir().info(
00626             "Distribution of distances: bin size: %.5f, number of bins: %d, min bin value: %.5f, max bin value: %.5f",
00627             bin_size, 55, lower, upper);
00628             for(i=0; i<55; i++){
00629                 g_CatSleipnir().info("%lu\t%lu", i, bins[i]);
00630             }
00631         }
00632     }
00633     return 0;
00634 }
00635 
00636 CPCLImpl::~CPCLImpl() {
00637 
00638     Reset();
00639 }
00640 
00641 void CPCLImpl::Reset() {
00642 
00643     m_Data.Reset();
00644     m_vecstrGenes.clear();
00645     m_vecstrExperiments.clear();
00646     m_vecstrFeatures.clear();
00647     m_vecvecstrFeatures.clear();
00648     m_setiGenes.clear();
00649     m_mapstriGenes.clear();
00650 }
00651 
00663 //bool CPCL::Open(const char* szFile, size_t iSkip, bool Memmap) {
00664 //  std::ifstream ifsm;
00665 //  if (szFile)
00666 //      ifsm.open(szFile);
00667 //
00668 //  // is this a binary binary file?
00669 //  if (!strcmp(szFile + strlen(szFile) - strlen(c_szBinExtension),
00670 //          c_szBinExtension))
00671 //      return OpenBinary(szFile ? ifsm : cin, Memmap);
00672 //  else
00673 //      // is this a text based PCL file?
00674 //      return Open(szFile ? ifsm : cin, iSkip);
00675 //}
00676 
00677 
00678 bool CPCL::Open(const char* szFile, size_t iSkip, bool Memmap, bool rTable) {
00679     bool isBinary = false;
00680     bool isDAB = false;
00681     ifstream ifsm;
00682     if (!strcmp(szFile + strlen(szFile) - strlen(c_szBinExtension), c_szBinExtension)) {
00683         isBinary = true;
00684     }
00685     if (!strcmp(szFile + strlen(szFile) - strlen(c_szDabExtension), c_szDabExtension)) {
00686         isDAB = true;
00687     }
00688     if (isBinary && Memmap) {
00689         g_CatSleipnir().notice("CPCL::Open, openning with memory mapping");
00690         Reset();
00691         if (!CMeta::MapRead(m_abData, m_hndlData, m_iData, szFile)) {
00692             g_CatSleipnir().error("CPCL::Open( %s, %d ) failed memory mapping",
00693                     szFile, Memmap);
00694 
00695             return false;
00696         }
00697         bool fRet = OpenHelper();
00698         return fRet;
00699     }
00700     else if (isBinary) {
00701         ifsm.open(szFile, ios::binary);
00702         return OpenBinary(ifsm);
00703     } else if (isDAB) {
00704         CDat dat;
00705         if (!dat.Open(szFile,false, 0, false, false)) {
00706             cerr << "Could not open dab to make pcl." << endl;
00707             return false;
00708         }
00709         std::vector<std::string> features;
00710         std::vector<std::string> genes = dat.GetGeneNames();
00711         Open(genes, genes, features);
00712         for (size_t i = 0; i < dat.GetGenes(); i++) {
00713             for (size_t j = i; j < dat.GetGenes(); j++) {
00714             if (i == j) {
00715                 Set(i, j, 1);
00716             }
00717             else {
00718                 float pairVal = dat.Get(i,j);
00719                 Set(i, j, pairVal);
00720                 Set(j, i, pairVal);
00721             }
00722             }
00723         }
00724         return true;
00725     } else {
00726         ifstream ifsm;
00727         if (szFile)
00728           ifsm.open(szFile);
00729         return Open(szFile ? ifsm : cin, iSkip, rTable);
00730     }
00731 }
00732 
00733 
00734 
00748 void CPCL::Save(const char* szFile) {
00749     std::ofstream ofsm;
00750 
00751     if (szFile)
00752         ofsm.open(szFile);
00753 
00754     // Save as binary PCL file?
00755     if (!strcmp(szFile + strlen(szFile) - strlen(c_szExtension), c_szBinExtension))
00756         SaveBinary(szFile ? ofsm : cout);
00757     else
00758         Save(szFile ? ofsm : cout);
00759         // Save as text file
00760 
00761 }
00762 
00774 void CPCL::Open(const CPCL& PCL) {
00775     size_t i, j;
00776     TSetI::const_iterator iterGene;
00777     TMapStrI::const_iterator iterGene2;
00778 
00779     Reset();
00780     m_fHeader = PCL.m_fHeader;
00781     m_Data.Initialize(PCL.m_Data.GetRows(), PCL.m_Data.GetColumns());
00782     for (i = 0; i < m_Data.GetRows(); ++i)
00783         for (j = 0; j < m_Data.GetColumns(); ++j)
00784             m_Data.Set(i, j, PCL.m_Data.Get(i, j));
00785 
00786     for (iterGene = PCL.m_setiGenes.begin(); iterGene != PCL.m_setiGenes.end(); ++iterGene)
00787         m_setiGenes.insert(*iterGene);
00788     m_vecstrExperiments.resize(PCL.m_vecstrExperiments.size());
00789     copy(PCL.m_vecstrExperiments.begin(), PCL.m_vecstrExperiments.end(),
00790             m_vecstrExperiments.begin());
00791     m_vecstrFeatures.resize(PCL.m_vecstrFeatures.size());
00792     copy(PCL.m_vecstrFeatures.begin(), PCL.m_vecstrFeatures.end(),
00793             m_vecstrFeatures.begin());
00794     m_vecstrGenes.resize(PCL.m_vecstrGenes.size());
00795     copy(PCL.m_vecstrGenes.begin(), PCL.m_vecstrGenes.end(),
00796             m_vecstrGenes.begin());
00797     for (iterGene2 = PCL.m_mapstriGenes.begin(); iterGene2
00798             != PCL.m_mapstriGenes.end(); ++iterGene2)
00799         m_mapstriGenes[iterGene2->first] = iterGene2->second;
00800     m_vecvecstrFeatures.resize(PCL.m_vecvecstrFeatures.size());
00801     for (i = 0; i < m_vecvecstrFeatures.size(); ++i) {
00802         m_vecvecstrFeatures[i].resize(PCL.m_vecvecstrFeatures[i].size());
00803         copy(PCL.m_vecvecstrFeatures[i].begin(),
00804                 PCL.m_vecvecstrFeatures[i].end(),
00805                 m_vecvecstrFeatures[i].begin());
00806     }
00807 }
00808 
00829 bool CPCL::Open(std::istream& istm, size_t iSkip, bool rTable) {
00830     vector<float> vecdData;
00831     size_t i;
00832     char* acBuf;
00833     bool fRet;
00834     string acStr;
00835 
00836     if (!istm.good())
00837         return false;
00838 
00839     acStr.reserve(c_iBufferSize);
00840     
00841     if (!OpenExperiments(istm, iSkip, acStr, rTable))
00842         fRet = false;
00843     else {
00844         m_vecvecstrFeatures.resize(m_vecstrFeatures.size() - 1);
00845         while (OpenGene(istm, vecdData, acStr))
00846             ;
00847         for (fRet = !!GetGenes(), i = 0; i < GetGenes(); ++i)
00848             if (GetGene(i).empty() || !isprint(GetGene(i)[0])) {
00849                 fRet = false;
00850                 g_CatSleipnir().error(
00851                         "CPCL::Open( %d ) invalid gene at index %d: %s", iSkip,
00852                         i, GetGene(i).c_str());
00853                 break;
00854             }
00855         if (fRet) {
00856             m_Data.Initialize(GetGenes(), GetExperiments());
00857             for (i = 0; i < m_Data.GetRows(); ++i)
00858                 m_Data.Set(i, &vecdData[i * m_Data.GetColumns()]);
00859         }
00860     }
00861     //delete[] acBuf;
00862 
00863     return fRet;
00864 }
00865 
00866 bool CPCLImpl::OpenHelper() {
00867     unsigned char* pb;
00868     pb = m_abData;
00869     size_t i, j;
00870     m_vecstrFeatures.resize(*(uint32_t*) pb);
00871     pb += sizeof(uint32_t);
00872     for (i = 0; i < m_vecstrFeatures.size(); ++i) {
00873         string& strFeature = m_vecstrFeatures[i];
00874         strFeature.resize(*(uint32_t*) pb);
00875         pb += sizeof(uint32_t);
00876         for (j = 0; j < strFeature.size(); ++j)
00877             strFeature[j] = *pb++;
00878     }
00879     m_vecstrExperiments.resize(*(uint32_t*) pb);
00880     i = 0;
00881     pb += sizeof(uint32_t);
00882     for (i = 0; i < m_vecstrExperiments.size(); ++i) {
00883         string& strExperiment = m_vecstrExperiments[i];
00884         strExperiment.resize(*(uint32_t*) pb);
00885         pb += sizeof(uint32_t);
00886         for (j = 0; j < strExperiment.size(); ++j) {
00887             strExperiment[j] = *pb++;
00888         }
00889     }
00890     m_vecstrGenes.resize(*(uint32_t*) pb);
00891     pb += sizeof(uint32_t);
00892     for (i = 0; i < m_vecstrGenes.size(); ++i) {
00893         string& strGene = m_vecstrGenes[i];
00894         strGene.resize(*(uint32_t*) pb);
00895         pb += sizeof(uint32_t);
00896         for (j = 0; j < strGene.size(); ++j)
00897             strGene[j] = *pb++;
00898         m_mapstriGenes[strGene] = i;
00899     }
00900     return OpenMemmap(pb);
00901 
00902 }
00903 
00904 bool CPCLImpl::OpenMemmap(const unsigned char* pb) {
00905     size_t i;
00906 
00907     m_aadData = new float*[m_vecstrGenes.size()];
00908     m_aadData[0] = (float*) pb;
00909     for (i = 1; i < m_vecstrGenes.size(); ++i)
00910         m_aadData[i] = m_aadData[i - 1] + m_vecstrExperiments.size();
00911     m_Data.Initialize(m_vecstrGenes.size(), m_vecstrExperiments.size(), (float**) m_aadData);
00912     return true;
00913 }
00914 
00915 bool CPCLImpl::OpenExperiments(std::istream& istmInput, size_t iFeatures,
00916                    string& acStr, bool rTable) {
00917     const char* pc;
00918     string strToken;
00919     size_t iToken;
00920     
00921     Reset();
00922     if (!m_fHeader) {
00923         m_vecstrFeatures.resize(1 + iFeatures);
00924         return true;
00925     }
00926     getline(istmInput, acStr);
00927     if (!acStr.size()) {
00928         return false;
00929     }
00930     
00931     if(rTable){   
00932       m_vecstrFeatures.push_back("");
00933     }
00934     
00935     for (iToken = 0, pc = acStr.c_str(); (strToken = OpenToken(pc, &pc)).length()
00936             || *pc; ++iToken)
00937       if (iToken < iFeatures)
00938         m_vecstrFeatures.push_back(strToken);
00939       else if (iToken <= iFeatures && !rTable)
00940         m_vecstrFeatures.push_back(strToken);
00941       else
00942         m_vecstrExperiments.push_back(strToken);
00943     
00944     return true;
00945 }
00946 
00947 bool CPCLImpl::OpenGene(std::istream& istmInput, std::vector<float>& vecdData,
00948         string& acStr) {
00949     const char* pc;
00950     char* pcEnd;
00951     string strToken;
00952     size_t iToken, iData, iBase, i;
00953     float d;
00954     map<string, size_t> mapstriValues;
00955     map<string, size_t>::iterator iterValue;
00956     const char* acLine = acStr.c_str();
00957 
00958     iBase = vecdData.size();
00959     getline(istmInput, acStr);
00960     if ((strToken = OpenToken(acLine)).empty())
00961         return false;
00962     if (strToken == "EWEIGHT")
00963         return true;
00964     if (!m_vecstrExperiments.empty())
00965         vecdData.resize(vecdData.size() + m_vecstrExperiments.size());
00966     for (iData = iToken = 0, pc = acLine; (strToken = OpenToken(pc, &pc)).length()
00967             || *pc; ++iToken) {
00968         if (!iToken) {
00969             m_mapstriGenes[strToken] = m_vecstrGenes.size();
00970             m_vecstrGenes.push_back(strToken);
00971         } else if (iToken < m_vecstrFeatures.size())
00972             m_vecvecstrFeatures[iToken - 1].push_back(strToken);
00973         else if (!m_vecstrExperiments.empty() && (iData
00974                 >= m_vecstrExperiments.size()))
00975             return false;
00976         else {
00977             d = CMeta::GetNaN();
00978             strToken = CMeta::Trim(strToken.c_str());
00979             if (strToken.length()) {
00980                 d = (float) strtod(strToken.c_str(), &pcEnd);
00981                 if (pcEnd != (strToken.c_str() + strToken.length())) {
00982                     iterValue = mapstriValues.find(strToken);
00983                     if (iterValue == mapstriValues.end()) {
00984                         i = mapstriValues.size();
00985                         mapstriValues[strToken] = i;
00986                         d = i;
00987                     } else
00988                         d = iterValue->second;
00989                 }
00990             }
00991             if (m_vecstrExperiments.empty())
00992                 vecdData.push_back(d);
00993             else if ((i = (iBase + iData++)) >= vecdData.size())
00994                 return false;
00995             else
00996                 vecdData[i] = d;
00997         }
00998     }
00999 
01000     if (m_vecstrExperiments.empty())
01001         m_vecstrExperiments.resize(vecdData.size());
01002     else
01003         while (iData < m_vecstrExperiments.size())
01004             vecdData[iBase + iData++] = CMeta::GetNaN();
01005 
01006     return !!iToken;
01007 }
01008 
01026 void CPCL::SaveHeader(std::ostream& ostm, bool fCDT) const {
01027     size_t i;
01028 
01029     if (!m_fHeader)
01030         return;
01031 
01032 /*  if (fCDT)
01033         ostm << c_szGID << '\t'; */
01034     ostm << m_vecstrFeatures[0]; //Gene name
01035 //  for (i = 1; i < m_vecstrFeatures.size(); ++i)
01036 //      ostm << '\t' << m_vecstrFeatures[i];
01037     for (i = 0; i < m_vecstrExperiments.size(); ++i)
01038         ostm << '\t' << m_vecstrExperiments[i];
01039     ostm << endl;
01040 
01041 //  ostm << c_szEWEIGHT;
01042 //  for (i = fCDT ? 0 : 1; i < m_vecstrFeatures.size(); ++i)
01043 //      ostm << '\t';
01044 //  for (i = 0; i < m_vecstrExperiments.size(); ++i)
01045 //      ostm << '\t' << 1;
01046 //  ostm << endl;
01047 }
01048 
01070 void CPCL::SaveGene(std::ostream& ostm, size_t iGene, size_t iOriginal) const {
01071     size_t i;
01072     float d;
01073 
01074     if (iOriginal != -1)
01075         ostm << c_szGENE << iOriginal << '\t';
01076     ostm << GetGene(iGene);
01077     for (i = 0; i < m_vecvecstrFeatures.size(); ++i)
01078         ostm << '\t' << m_vecvecstrFeatures[i][iGene];
01079     for (i = 0; i < m_vecstrExperiments.size(); ++i) {
01080         ostm << '\t';
01081         if (!CMeta::IsNaN(d = Get(iGene, i)))
01082             ostm << Get(iGene, i);
01083     }
01084     ostm << endl;
01085 }
01086 
01108 //Save only certain genes
01109 bool CPCL::Save(const char* szFile, const std::vector<size_t>* pveciGenes) const {
01110     ofstream ofsm;
01111     if (!strcmp(szFile + strlen(szFile) - 4, ".bin")) {
01112         ofsm.open(szFile, ios::binary);
01113         SaveBinary(ofsm);
01114     } else {
01115         ofsm.open(szFile);
01116         Save(ofsm, pveciGenes);
01117     }
01118 
01119 }
01120 void CPCL::Save(std::ostream& ostm, const std::vector<size_t>* pveciGenes) const {
01121     size_t i;
01122 
01123     SaveHeader(ostm, !!pveciGenes);
01124 
01125     for (i = 0; i < GetGenes(); ++i) {
01126         if (m_setiGenes.find(i) != m_setiGenes.end())
01127             continue;
01128         SaveGene(ostm, i, pveciGenes ? (*pveciGenes)[i] : -1);
01129     }
01130 }
01131 
01147 void CPCL::SaveBinary(std::ostream& ostm) const {
01148     uint32_t iTmp;
01149     size_t i;
01150 
01151     iTmp = GetFeatures();
01152     ostm.write((const char*) &iTmp, sizeof(iTmp));
01153     for (i = 0; i < GetFeatures(); ++i)
01154         SaveString(ostm, GetFeature(i));
01155     iTmp = GetExperiments();
01156     ostm.write((const char*) &iTmp, sizeof(iTmp));
01157     for (i = 0; i < GetExperiments(); ++i)
01158         SaveString(ostm, GetExperiment(i));
01159     size_t iCount = 0;
01160     for (i = 0; i < GetGenes(); i++)
01161         if (!IsMasked(i))
01162             iCount++;
01163     iTmp = iCount;
01164     ostm.write((const char*) &iTmp, sizeof(iTmp));
01165     for (i = 0; i < GetGenes(); ++i)
01166         if (!IsMasked(i))
01167         SaveString(ostm, GetGene(i));
01168 
01169     for (i = 0; i < GetGenes(); ++i)
01170         if (!IsMasked(i))
01171         ostm.write((const char*) Get(i), GetExperiments() * sizeof(*Get(i)));
01172 }
01173 
01184 bool CPCL::OpenBinary(std::istream& istm) {
01185     uint32_t iTmp;
01186     size_t i;
01187 
01188     if (!istm.good())
01189         return false;
01190     
01191 
01192     Reset();
01193     istm.read((char*) &iTmp, sizeof(iTmp));
01194     m_vecstrFeatures.resize(iTmp);
01195     for (i = 0; i < m_vecstrFeatures.size(); ++i)
01196         OpenString(istm, m_vecstrFeatures[i]);
01197 
01198     istm.read((char*) &iTmp, sizeof(iTmp));
01199     m_vecstrExperiments.resize(iTmp);
01200     for (i = 0; i < m_vecstrExperiments.size(); ++i)
01201         OpenString(istm, m_vecstrExperiments[i]);
01202 
01203     istm.read((char*) &iTmp, sizeof(iTmp));
01204     m_vecstrGenes.resize(iTmp);
01205     for (i = 0; i < m_vecstrGenes.size(); ++i) {
01206         OpenString(istm, m_vecstrGenes[i]);
01207         m_mapstriGenes[m_vecstrGenes[i]] = i;
01208     }
01209 
01210     m_Data.Initialize(GetGenes(), GetExperiments());
01211     for (i = 0; i < m_Data.GetRows(); ++i)
01212         istm.read((char*) m_Data.Get(i), GetExperiments() * sizeof(*m_Data.Get(
01213                 i)));
01214 
01215     return true;
01216 }
01217 
01236 void CPCL::Open(const std::vector<std::string>& vecstrGenes, const std::vector<
01237         std::string>& vecstrExperiments,
01238         const std::vector<std::string>& vecstrFeatures) {
01239     size_t i, j;
01240 
01241     Reset();
01242     if (vecstrFeatures.empty())
01243         m_vecstrFeatures.push_back("GID");
01244     else {
01245         m_vecstrFeatures.resize(vecstrFeatures.size());
01246         copy(vecstrFeatures.begin(), vecstrFeatures.end(),
01247                 m_vecstrFeatures.begin());
01248         m_vecvecstrFeatures.resize(m_vecstrFeatures.size() - 1);
01249         for (i = 0; i < m_vecvecstrFeatures.size(); ++i)
01250             m_vecvecstrFeatures[i].resize(vecstrGenes.size());
01251     }
01252 
01253     m_vecstrGenes.resize(vecstrGenes.size());
01254     for (i = 0; i < GetGenes(); ++i)
01255         SetGene(i, vecstrGenes[i]);
01256     m_vecstrExperiments.resize(vecstrExperiments.size());
01257     for (i = 0; i < GetExperiments(); ++i)
01258         SetExperiment(i, vecstrExperiments[i]);
01259 
01260     m_Data.Initialize(GetGenes(), GetExperiments());
01261     for (i = 0; i < m_Data.GetRows(); ++i)
01262         for (j = 0; j < m_Data.GetColumns(); ++j)
01263             m_Data.Set(i, j, CMeta::GetNaN());
01264 }
01265 
01291 void CPCL::Open(const std::vector<size_t>& veciGenes, const std::vector<
01292         std::string>& vecstrGenes,
01293         const std::vector<std::string>& vecstrExperiments) {
01294     size_t i, j;
01295     char ac[16];
01296 
01297     Reset();
01298     m_vecstrFeatures.resize(4);
01299     m_vecstrFeatures[0] = "GID";
01300     m_vecstrFeatures[1] = "YORF";
01301     m_vecstrFeatures[2] = "NAME";
01302     m_vecstrFeatures[3] = "GWEIGHT";
01303     m_vecvecstrFeatures.resize(m_vecstrFeatures.size() - 1);
01304     for (i = 0; i < m_vecvecstrFeatures.size(); ++i)
01305         m_vecvecstrFeatures[i].resize(veciGenes.size());
01306     for (i = 0; i < veciGenes.size(); ++i) {
01307         m_vecvecstrFeatures[0][i] = m_vecvecstrFeatures[1][i]
01308                 = vecstrGenes[veciGenes[i]];
01309         m_vecvecstrFeatures[2][i] = "1";
01310     }
01311 
01312     m_vecstrGenes.resize(vecstrGenes.size());
01313     for (i = 0; i < m_vecstrGenes.size(); ++i) {
01314         m_vecstrGenes[i] = "GENE";
01315         sprintf_s(ac, "%d", veciGenes[i]);
01316         m_vecstrGenes[i] += ac;
01317         m_mapstriGenes[m_vecstrGenes[i]] = i;
01318     }
01319     m_vecstrExperiments.resize(vecstrExperiments.size());
01320     for (i = 0; i < m_vecstrExperiments.size(); ++i)
01321         m_vecstrExperiments[i] = vecstrExperiments[i];
01322 
01323     m_Data.Initialize(GetGenes(), GetExperiments());
01324     for (i = 0; i < m_Data.GetRows(); ++i)
01325         for (j = 0; j < m_Data.GetColumns(); ++j)
01326             m_Data.Set(i, j, CMeta::GetNaN());
01327 }
01328 
01342 bool CPCL::SortGenes(const vector<size_t>& veciOrder) {
01343     size_t i;
01344 
01345     if (veciOrder.size() != m_Data.GetRows())
01346         return false;
01347 
01348     CMeta::Permute(m_Data.Get(), veciOrder);
01349     CMeta::Permute(m_vecstrGenes, veciOrder);
01350     for (i = 0; i < GetGenes(); ++i)
01351         m_mapstriGenes[GetGene(i)] = i;
01352     for (i = 0; i < m_vecvecstrFeatures.size(); ++i)
01353         CMeta::Permute(m_vecvecstrFeatures[i], veciOrder);
01354 
01355     return true;
01356 }
01357 
01369 void CPCL::RankTransform() {
01370     size_t i, j, k;
01371     vector<size_t> veciRanks, veciCounts;
01372 
01373     veciRanks.resize(GetExperiments());
01374     veciCounts.resize(GetExperiments());
01375     for (i = 0; i < GetGenes(); ++i) {
01376         fill(veciRanks.begin(), veciRanks.end(), 0);
01377         for (j = 0; j < GetExperiments(); ++j) {
01378             if (CMeta::IsNaN(Get(i, j)))
01379                 continue;
01380             for (k = 0; k < GetExperiments(); ++k) {
01381                 if (CMeta::IsNaN(Get(i, k)))
01382                     continue;
01383                 if ((j != k) && (Get(i, k) < Get(i, j)))
01384                     veciRanks[j]++;
01385             }
01386         }
01387 
01388         fill(veciCounts.begin(), veciCounts.end(), 0);
01389         for (j = 0; j < GetExperiments(); ++j)
01390             if (!CMeta::IsNaN(Get(i, j)))
01391                 veciCounts[veciRanks[j]]++;
01392 
01393         for (j = 0; j < GetExperiments(); ++j)
01394             if (!CMeta::IsNaN(Get(i, j))) {
01395                 k = veciRanks[j];
01396                 // Closed form for sum(rank, rank + n) / n
01397                 Set(i, j, k + ((veciCounts[k] + 1) / 2.0f));
01398             }
01399     }
01400 }
01401 
01415 bool CPCL::AddGenes(const std::vector<std::string>& vecstrGenes) {
01416     size_t i, j, iStart;
01417 
01418     iStart = m_Data.GetRows();
01419     if (!m_Data.AddRows(vecstrGenes.size()))
01420         return false;
01421     for (i = iStart; i < m_Data.GetRows(); ++i)
01422         for (j = 0; j < m_Data.GetColumns(); ++j)
01423             m_Data.Set(i, j, CMeta::GetNaN());
01424 
01425     m_vecstrGenes.resize(m_vecstrGenes.size() + vecstrGenes.size());
01426     for (i = 0; i < vecstrGenes.size(); ++i)
01427         SetGene(iStart + i, vecstrGenes[i]);
01428     for (i = 0; i < m_vecvecstrFeatures.size(); ++i) {
01429         m_vecvecstrFeatures[i].resize(m_vecvecstrFeatures[i].size()
01430                 + vecstrGenes.size());
01431         if (m_vecstrFeatures[i + 1] == c_szNAME)
01432             for (j = 0; j < vecstrGenes.size(); ++j)
01433                 m_vecvecstrFeatures[i][iStart + j] = vecstrGenes[j];
01434         else if (m_vecstrFeatures[i + 1] == c_szGWEIGHT)
01435             for (j = 0; j < vecstrGenes.size(); ++j)
01436                 m_vecvecstrFeatures[i][iStart + j] = c_szOne;
01437     }
01438 
01439     return true;
01440 }
01441 
01452 void CPCL::Normalize(ENormalize eNormalize) {
01453     size_t i, j, iCount;
01454     double dAve, dStd;
01455     float d, dMin, dMax;
01456 
01457     switch (eNormalize) {
01458     case ENormalizeMean:
01459         dMin = FLT_MAX;
01460         dAve = 0;
01461         for (iCount = i = 0; i < GetGenes(); ++i)
01462             for (j = 0; j < GetExperiments(); ++j)
01463                 if (!CMeta::IsNaN(d = Get(i, j))) {
01464                     if (d < dMin)
01465                         dMin = d;
01466                     iCount++;
01467                     dAve += d;
01468                 }
01469         if (!iCount)
01470             break;
01471         dAve = (dAve / iCount) - dMin;
01472         for (i = 0; i < GetGenes(); ++i)
01473             for (j = 0; j < GetExperiments(); ++j)
01474                 if (!CMeta::IsNaN(d = Get(i, j)))
01475                     Set(i, j, (float) ((d - dMin) / dAve));
01476         break;
01477 
01478     case ENormalizeZScore:
01479         dAve = dStd = 0;
01480         for (iCount = i = 0; i < GetGenes(); ++i)
01481             for (j = 0; j < GetExperiments(); ++j)
01482                 if (!CMeta::IsNaN(d = Get(i, j))) {
01483                     iCount++;
01484                     dAve += d;
01485                     dStd += d * d;
01486                 }
01487         dAve /= iCount;
01488         dStd = sqrt((dStd / iCount) - (dAve * dAve));
01489         for (i = 0; i < GetGenes(); ++i)
01490             for (j = 0; j < GetExperiments(); ++j)
01491                 if (!CMeta::IsNaN(d = Get(i, j)))
01492                     Set(i, j, (float) ((d - dAve) / dStd));
01493         break;
01494 
01495     case ENormalizeRow:
01496         for (i = 0; i < GetGenes(); ++i) {
01497             dAve = CStatistics::Average(Get(i), Get(i) + GetExperiments());
01498             dStd = sqrt(CStatistics::Variance(Get(i),
01499                     Get(i) + GetExperiments(), dAve));
01500             if (dStd)
01501                 for (j = 0; j < GetExperiments(); ++j)
01502                     Set(i, j, (float) ((Get(i, j) - dAve) / dStd));
01503         }
01504         break;
01505 
01506     case ENormalizeColumn:
01507     case ENormalizeColumnCenter:
01508     case ENormalizeColumnFraction:
01509         for (i = 0; i < GetExperiments(); ++i) {
01510             dAve = dStd = 0;
01511             for (iCount = j = 0; j < GetGenes(); ++j)
01512                 if (!CMeta::IsNaN(d = Get(j, i))) {
01513                     iCount++;
01514                     dAve += d;
01515                     dStd += d * d;
01516                 }
01517             if (iCount) {
01518                 if (eNormalize != ENormalizeColumnFraction) {
01519                     dAve /= iCount;
01520                     dStd = (dStd / iCount) - (dAve * dAve);
01521                     dStd = (dStd <= 0) ? 1 : sqrt(dStd);
01522                     if (eNormalize == ENormalizeColumnCenter)
01523                         dStd = 1;
01524                 }
01525                 for (j = 0; j < GetGenes(); ++j)
01526                     if (!CMeta::IsNaN(d = Get(j, i))) {
01527                         if (eNormalize == ENormalizeColumnFraction)
01528                             d /= (float) dAve;
01529                         else
01530                             d = (float) ((d - dAve) / dStd);
01531                         Set(j, i, d);
01532                     }
01533             }
01534         }
01535         break;
01536 
01537     case ENormalizeMinMax:
01538         dMin = FLT_MAX;
01539         dMax = -dMin;
01540         for (i = 0; i < GetGenes(); ++i)
01541             for (j = 0; j < GetExperiments(); ++j)
01542                 if (!CMeta::IsNaN(d = Get(i, j))) {
01543                     if (d < dMin)
01544                         dMin = d;
01545                     if (d > dMax)
01546                         dMax = d;
01547                 }
01548         if (!(dMax -= dMin))
01549             dMax = 1;
01550         for (i = 0; i < GetGenes(); ++i)
01551             for (j = 0; j < GetExperiments(); ++j)
01552                 if (!CMeta::IsNaN(d = Get(i, j)))
01553                     Set(i, j, (d - dMin) / dMax);
01554         break;
01555     case EMeanSubtractColumn:
01556         for (i = 0; i < GetExperiments(); ++i) {
01557             dAve = 0;
01558             for (iCount = j = 0; j < GetGenes(); ++j)
01559                 if (!CMeta::IsNaN(d = Get(j, i))) {
01560                     iCount++;
01561                     dAve += d;
01562 
01563                 }
01564 
01565             if (iCount) {
01566                 dAve /= iCount;
01567                 for (j = 0; j < GetGenes(); ++j)
01568                     if (!CMeta::IsNaN(d = Get(j, i)))
01569                         Set(j, i, (float) (d - dAve));
01570             }
01571         }
01572         break;
01573     }
01574 }
01575 
01600 void CPCL::Impute(size_t iNeighbors, float dMinimumPresent,
01601         const CDat& DatSimilarity) {
01602     vector<float> vecdMissing;
01603     vector<size_t> veciMissing;
01604     vector<SNeighbors> vecsNeighbors;
01605     size_t i, j, k, iCol, iMissing, iOne, iTwo;
01606     float d, dSum;
01607     const float* ad;
01608 
01609     vecdMissing.resize(GetGenes());
01610     for (i = 0; i < vecdMissing.size(); ++i) {
01611         for (iMissing = j = 0; j < GetExperiments(); ++j)
01612             if (CMeta::IsNaN(Get(i, j)))
01613                 iMissing++;
01614         if ((vecdMissing[i] = (float) (GetExperiments() - iMissing)
01615                 / GetExperiments()) >= dMinimumPresent)
01616             veciMissing.push_back(i);
01617     }
01618     if (iNeighbors) {
01619         vecsNeighbors.resize(veciMissing.size());
01620         for (i = 0; i < vecsNeighbors.size(); ++i)
01621             vecsNeighbors[i].Initialize(Get(veciMissing[i]), GetExperiments(),
01622                     iNeighbors);
01623         for (i = 0; i < veciMissing.size(); ++i) {
01624             if (!(i % 100))
01625                 g_CatSleipnir().info(
01626                         "CPCL::Impute( %d, %g ) finding neighbors for gene %d/%d",
01627                         iNeighbors, dMinimumPresent, i, veciMissing.size());
01628             ad = Get(iOne = veciMissing[i]);
01629             for (j = (i + 1); j < veciMissing.size(); ++j) {
01630                 d = DatSimilarity.Get(iOne, iTwo = veciMissing[j]);
01631                 vecsNeighbors[i].Add(iTwo, d, Get(iTwo));
01632                 vecsNeighbors[j].Add(iOne, d, ad);
01633             }
01634         }
01635     }
01636 
01637     for (iOne = i = 0; i < GetGenes(); ++i) {
01638         if (vecdMissing[i] < dMinimumPresent) {
01639             MaskGene(i);
01640             continue;
01641         }
01642         if (vecsNeighbors.empty())
01643             continue;
01644         {
01645             const SNeighbors& sGene = vecsNeighbors[iOne++];
01646 
01647             for (j = 0; j < GetExperiments(); ++j) {
01648                 if (!CMeta::IsNaN(Get(i, j)))
01649                     continue;
01650 
01651                 iCol = sGene.GetColumn(j);
01652                 for (d = dSum = 0, iMissing = k = 0; k < iNeighbors; ++k) {
01653                     const pair<size_t, float>& prNeighbor =
01654                             sGene.m_MatNeighbors.Get(k, iCol);
01655 
01656                     if (prNeighbor.second == -FLT_MAX)
01657                         continue;
01658                     iMissing++;
01659                     dSum += prNeighbor.second;
01660                     d += Get(prNeighbor.first, j) * prNeighbor.second;
01661                 }
01662                 if (dSum)
01663                     d /= dSum;
01664                 Set(i, j, iMissing ? d : CMeta::GetNaN());
01665             }
01666         }
01667     }
01668 }
01669 
01701 void CPCL::Impute(size_t iNeighbors, float dMinimumPresent,
01702         const IMeasure* pMeasure, bool fPrecompute) {
01703     CDat Dat;
01704     size_t i, j;
01705 
01706     if (fPrecompute) {
01707         Dat.Open(GetGeneNames());
01708         for (i = 0; i < Dat.GetGenes(); ++i)
01709             for (j = (i + 1); j < Dat.GetGenes(); ++j)
01710                 Dat.Set(i, j, (float) pMeasure->Measure(Get(i),
01711                         GetExperiments(), Get(j), GetExperiments()));
01712     } else
01713         Dat.Open(*this, pMeasure, false);
01714     Impute(iNeighbors, dMinimumPresent, Dat);
01715 }
01716 
01733 void CPCL::MedianMultiples(size_t iSample, size_t iBins, float dBinSize) {
01734     size_t i, j, k, iBin, iOne, iCutoff;
01735     CMeasureEuclidean Euclidean;
01736     vector<float> vecdEuclidean, vecdMapped, vecdBG, vecdFG, vecdMean;
01737     float d, dAve, dStd, dSample;
01738     vector<vector<size_t> > vecveciGenes;
01739     vector<size_t> veciMean;
01740     vector<string> vecstrAdd;
01741 
01742     {
01743         map<string, size_t> mapstriClasses;
01744         map<string, size_t>::iterator iterClass;
01745 
01746         for (i = 0; i < m_vecstrGenes.size(); ++i) {
01747             const string& strGene = m_vecstrGenes[i];
01748 
01749             if ((iterClass = mapstriClasses.find(strGene))
01750                     == mapstriClasses.end()) {
01751                 mapstriClasses[strGene] = vecveciGenes.size();
01752                 vecveciGenes.push_back(vector<size_t> ());
01753                 vecveciGenes.back().push_back(i);
01754             } else
01755                 vecveciGenes[iterClass->second].push_back(i);
01756         }
01757     }
01758 
01759     dSample = 2.0f * iSample / GetGenes() / (GetGenes() - 1);
01760     dAve = dStd = 0;
01761     for (k = i = 0; i < GetGenes(); ++i)
01762         for (j = (i + 1); j < GetGenes(); ++j) {
01763             if (((float) rand() / RAND_MAX) >= dSample)
01764                 continue;
01765             vecdEuclidean.push_back(d = (float) Euclidean.Measure(Get(i),
01766                     GetExperiments(), Get(j), GetExperiments(),
01767                     IMeasure::EMapNone));
01768             k++;
01769             dAve += d;
01770             dStd += d * d;
01771         }
01772     dAve /= k;
01773     dStd = sqrt((dStd / (k - 1)) - (dAve * dAve));
01774     if (!(iBins % 2))
01775         iBins++;
01776 
01777     vecdBG.resize(iBins);
01778     for (i = 0; i < vecdEuclidean.size(); ++i)
01779         vecdBG[MedianMultiplesBin(vecdEuclidean[i], dAve, dStd, iBins, dBinSize)]++;
01780     for (i = 0; i < vecdBG.size(); ++i)
01781         //      vecdBG[i] /= vecdEuclidean.size( );
01782         vecdBG[i] = (vecdBG[i] + 1) / (vecdEuclidean.size() + vecdBG.size());
01783     //*
01784     MedianMultiplesSmooth(2, vecdBG);
01785     //*/
01786 
01787     MedianMultiplesMapped(vecveciGenes, vecdMapped);
01788     vecdFG.resize(iBins);
01789     for (i = 0; i < vecdMapped.size(); ++i)
01790         vecdFG[MedianMultiplesBin(vecdMapped[i], dAve, dStd, iBins, dBinSize)]++;
01791     /*
01792      dMax = 0;
01793      for( iMax = i = 0; i < vecdBG.size( ); ++i )
01794      if( vecdBG[i] > dMax ) {
01795      dMax = vecdBG[i];
01796      iMax = i; }
01797      for( i = 0; i <= iMax; ++i )
01798      vecdFG[i]++;
01799      for( i = 0; i < vecdFG.size( ); ++i )
01800      vecdFG[i] /= vecdMapped.size( ) + iMax + 1;
01801      //*/
01802     //*/
01803     for (i = 0; i < vecdFG.size(); ++i)
01804         //      vecdFG[i] /= vecdMapped.size( );
01805         vecdFG[i] = (vecdFG[i] + 1) / (vecdMapped.size() + vecdFG.size());
01806     MedianMultiplesSmooth(2, vecdFG);
01807     /*
01808      for( i = 0; i < vecdBG.size( ); ++i )
01809      cerr << vecdBG[i] << '\t';
01810      cerr << endl;
01811      for( i = 0; i < vecdFG.size( ); ++i )
01812      cerr << vecdFG[i] << '\t';
01813      cerr << endl;
01814      //*/
01815     for (iCutoff = 0; iCutoff < vecdFG.size(); ++iCutoff)
01816         if (vecdFG[iCutoff] < vecdBG[iCutoff])
01817             break;
01818 
01819     vecstrAdd.resize(1);
01820     veciMean.resize(GetExperiments());
01821     vecdMean.resize(GetExperiments());
01822     for (i = 0; i < vecveciGenes.size(); ++i) {
01823         const vector<size_t>& veciGenes = vecveciGenes[i];
01824         vector<size_t> veciAgree;
01825 
01826         if (veciGenes.size() < 2)
01827             continue;
01828         veciAgree.resize(veciGenes.size());
01829         for (j = 0; j < veciGenes.size(); ++j) {
01830             iOne = veciGenes[j];
01831             for (k = (j + 1); k < veciGenes.size(); ++k) {
01832                 iBin = MedianMultiplesBin((float) Euclidean.Measure(Get(iOne),
01833                         GetExperiments(), Get(veciGenes[k]), GetExperiments(),
01834                         IMeasure::EMapNone), dAve, dStd, iBins, dBinSize);
01835                 //              if( iBin <= iCutoff ) {
01836                 if ((1.01 * vecdFG[iBin]) >= vecdBG[iBin]) {
01837                     veciAgree[j]++;
01838                     veciAgree[k]++;
01839                 }
01840             }
01841         }
01842 
01843         fill(vecdMean.begin(), vecdMean.end(), 0.0f);
01844         fill(veciMean.begin(), veciMean.end(), 0);
01845         for (iBin = j = 0; j < veciGenes.size(); ++j)
01846             if ((2 * veciAgree[j]) >= (veciAgree.size() - 1)) {
01847                 iBin++;
01848                 iOne = veciGenes[j];
01849                 for (k = 0; k < GetExperiments(); ++k)
01850                     if (!CMeta::IsNaN(d = Get(iOne, k))) {
01851                         veciMean[k]++;
01852                         vecdMean[k] += d;
01853                     }
01854             }
01855 
01856         for (j = 0; j < veciGenes.size(); ++j)
01857             MaskGene(veciGenes[j]);
01858         if ((iBin * 2) > veciAgree.size()) {
01859             for (j = 0; j < vecdMean.size(); ++j)
01860                 vecdMean[j] = veciMean[j] ? (vecdMean[j] / veciMean[j])
01861                         : CMeta::GetNaN();
01862             vecstrAdd[0] = GetGene(veciGenes[0]);
01863             AddGenes(vecstrAdd);
01864             Set(GetGenes() - 1, &vecdMean[0]);
01865         }
01866     }
01867 }
01868 
01869 void CPCLImpl::MedianMultiplesMapped(
01870         const vector<vector<size_t> >& vecveciGenes, vector<float>& vecdMapped) {
01871     size_t i, j, k;
01872     CMeasureEuclidean Euclidean;
01873     const float* adOne;
01874 
01875     for (i = 0; i < vecveciGenes.size(); ++i) {
01876         const vector<size_t>& veciGenes = vecveciGenes[i];
01877 
01878         if (veciGenes.size() < 2)
01879             continue;
01880         for (j = 0; j < veciGenes.size(); ++j) {
01881             adOne = m_Data.Get(veciGenes[j]);
01882             for (k = (j + 1); k < veciGenes.size(); ++k)
01883                 vecdMapped.push_back((float) Euclidean.Measure(adOne,
01884                         m_Data.GetColumns(), m_Data.Get(veciGenes[k]),
01885                         m_Data.GetColumns(), IMeasure::EMapNone));
01886         }
01887     }
01888 }
01889 
01895   bool CPCL::populate(const char* szFile, float dDefault){
01896   ifstream istm;
01897   const char*   pc;
01898   char*     pcTail;
01899   char*     acBuf;
01900   string        strToken, strCache, strValue;
01901   size_t        iOne, iTwo, i;
01902   float     dScore;
01903   
01904   istm.open( szFile );
01905   
01906   acBuf = new char[ c_iBufferSize ];
01907   while( istm.peek( ) != EOF ) {
01908     istm.getline( acBuf, c_iBufferSize - 1 );
01909     strToken = OpenToken( acBuf, &pc );
01910     if( !strToken.length( ) )
01911       break;
01912     //if( strToken == c_acComment )
01913     //  continue;
01914     if( strToken != strCache ) {
01915       strCache = strToken;          
01916       iOne = GetGene( strToken );
01917     }
01918     
01919     strToken = OpenToken( pc, &pc );
01920     if( !strToken.length( ) ) {
01921       delete[] acBuf;
01922       return false; }
01923     iTwo = GetGene( strToken );
01924     
01925     strValue = OpenToken( pc );
01926     if( !strValue.length( ) ) {
01927       if( CMeta::IsNaN( dScore = dDefault ) ) {
01928     delete[] acBuf;
01929     return false; } }
01930     else if( !( dScore = (float)strtod( strValue.c_str( ), &pcTail ) ) &&
01931          ( pcTail != ( strValue.c_str( ) + strValue.length( ) ) ) ) {
01932       delete[] acBuf;
01933       return false; }
01934     
01935     Set(iOne, iTwo, dScore);
01936     
01937   }
01938   delete[] acBuf;
01939   
01940   return true;
01941 }
01942   
01943 }