Sleipnir
|
Encapsulates a PCL (or, in a pinch, CDT) formatted microarray data file. More...
#include <pcl.h>
Public Types | |
enum | ENormalize { ENormalizeNone, ENormalizeZScore, ENormalizeRow, ENormalizeMinMax, ENormalizeColumn, ENormalizeMean, ENormalizeColumnCenter, ENormalizeColumnFraction, EMeanSubtractColumn } |
Ways in which a PCL's data entries can be normalized. More... | |
Public Member Functions | |
CPCL (bool fHeader=true) | |
Create a new PCL object with or without a header row. | |
void | Open (const CPCL &PCL) |
Create a new PCL by copying the given one. | |
void | Open (const std::vector< size_t > &veciGenes, const std::vector< std::string > &vecstrGenes, const std::vector< std::string > &vecstrExperiments) |
Create a new PCL using the given genes, experiments, and gene order. | |
void | Open (const std::vector< std::string > &vecstrGenes, const std::vector< std::string > &vecstrExperiments, const std::vector< std::string > &vecstrFeatures) |
Create a new PCL using the given genes, experiments, and features. | |
bool | OpenBinary (std::istream &istm) |
Load a PCL from the given binary stream. | |
bool | Save (const char *szFile, const std::vector< size_t > *pveciGenes=NULL) const |
Save a PCL to the given text stream. | |
void | Save (std::ostream &ostm, const std::vector< size_t > *pveciGenes=NULL) const |
void | SaveBinary (std::ostream &ostm) const |
Save a PCL to the given binary stream. | |
void | SaveGene (std::ostream &ostm, size_t iGene, size_t iOriginal=-1) const |
Save a single gene row to the given text stream. | |
void | SaveHeader (std::ostream &ostm, bool fCDT=false) const |
Save the PCL's header row to the given text stream. | |
bool | SortGenes (const std::vector< size_t > &veciOrder) |
Reorder the PCL's genes based on the order of the given indices. | |
void | RankTransform () |
Rank transform each row of the PCL in increasing order. | |
bool | AddGenes (const std::vector< std::string > &vecstrGenes) |
Appends new, empty gene rows to the end of the PCL using the given gene IDs. | |
void | Normalize (ENormalize eNormalize=ENormalizeRow) |
Normalizes the PCL's values in the requested manner. | |
void | Impute (size_t iNeighbors, float dMinimumPresent, const CDat &DatSimilarity) |
Impute missing values using the knnimpute algorithm; optionally mask genes with too many missing values. | |
void | Impute (size_t iNeighbors, float dMinimumPresent, const IMeasure *pMeasure, bool fPrecompute=true) |
Impute missing values using the knnimpute algorithm; optionally mask genes with too many missing values. | |
void | MedianMultiples (size_t iSample=100000, size_t iBins=40, float dBinSize=0.25) |
Resolves probesets by averaging agreeing probes and discarding conflicting ones. | |
bool | populate (const char *szFile, float dDefault=HUGE_VAL) |
void | Save (const char *szFile=NULL) |
Save a PCL to the given text file. | |
bool | Open (const char *szFile, size_t iSkip=2, bool fMemmap=false, bool rTable=false) |
Load a PCL from the given text or binary file. | |
bool | Open (std::istream &istm, size_t iSkip, bool rTable=false) |
Load a PCL from the given text stream. | |
bool | Open (std::istream &istm) |
Load a PCL from the given text stream using the default number of skip columns. | |
void | Reset () |
Empties the PCL and deallocates all associated memory. | |
void | Clear () |
Sets all PCL entries to zero (without changing size or memory allocation). | |
size_t | GetFeatures () const |
Return the number of features (skip columns) in the PCL. | |
const std::string & | GetFeature (size_t iFeature) const |
Return the label (header) of the requested feature. | |
const std::string & | GetFeature (size_t iGene, size_t iFeature) const |
Return the requested gene's value for the given feature. | |
void | SetFeature (size_t iGene, size_t iFeature, const std::string &strValue) |
Set the requested gene's value for the given feature. | |
float & | Get (size_t iGene, size_t iExperiment) const |
Returns the value at the requested PCL position. | |
float * | Get (size_t iGene) const |
Return a single gene's row from the PCL. | |
void | Set (size_t iGene, const float *adValues) |
Set a single gene's row in the PCL. | |
const CDataMatrix & | Get () const |
Return the PCL's underlying data matrix. | |
size_t | GetGenes () const |
Return the number of gene rows in the PCL. | |
const std::vector< std::string > & | GetGeneNames () const |
Returns the vector of gene names associated with this PCL. | |
const std::vector< std::string > & | GetExperimentNames () const |
size_t | GetExperiments () const |
Return the number of experiment columns in the PCL. | |
const std::string & | GetGene (size_t iGene) const |
Returns the gene name at the given PCL row. | |
void | SetGene (size_t iGene, const std::string &strGene) |
Set the gene name at the given index. | |
const std::string & | GetExperiment (size_t iExperiment) const |
Returns the experiment label at the given PCL column. | |
size_t | GetExperiment (const std::string &strExperiment) const |
Returns the PCL column of the given experiment label. | |
void | SetExperiment (size_t iExperiment, const std::string &strExperiment) |
Set the experiment label at the given PCL column. | |
void | MaskGene (size_t iGene, bool fMask=true) |
Set the mask value for a given gene row. | |
bool | IsMasked (size_t iGene) const |
Return true if the requested gene index is masked. | |
void | Set (size_t iGene, size_t iExperiment, float dValue) |
Set the value at the requested PCL position. | |
size_t | GetGene (const std::string &strGene) const |
Return the index of the given gene name, or -1 if it is not included in the PCL. | |
std::string | GetFeature (size_t iGene, const char *szFeature) const |
Return the requested gene's value for the given feature. | |
size_t | AddFeature (string strName) |
void | Randomize () |
Randomizes the values in each row of the PCL. | |
Static Public Member Functions | |
static int | Distance (const char *szFile, size_t iSkip, const char *szSimilarityMeasure, bool fNormalize, bool fZScore, bool fAutocorrelate, const char *szGeneFile, float dCutoff, size_t iLimit, CPCL &PCL, CDat &Dat, IMeasure::EMap eMap=IMeasure::EMapCenter, bool fFrequencyWeight=false, float dAlpha=0, int nThreads=1) |
Kitchen sink method for completely loading a PCL and calculating its pairwise similarity scores into a CDat. | |
static int | Distance (const char *szFile, size_t iSkip, const char *szWeights, const char *szSimilarityMeasure, bool fNormalize, bool fZScore, bool fAutocorrelate, const char *szGeneFile, float dCutoff, size_t iLimit, CPCL &PCL, CDat &Dat, IMeasure::EMap eMap=IMeasure::EMapCenter, bool fFrequencyWeight=false, float dAlpha=0, int nThreads=1) |
Kitchen sink method for completely loading a PCL and calculating its pairwise similarity scores into a CDat. This method supports weighted metrics as opposed to the above method which does not. This allows distancer to take a weights vector (currently included in the documentation but it was not previously functioning). The weights vector is used only if freqweight is false. | |
static size_t | GetSkip () |
Return the default number of skip columns between the gene IDs and experimental values. | |
static const char * | GetExtension () |
Returns the standard extension (including period) for PCL files. |
Encapsulates a PCL (or, in a pinch, CDT) formatted microarray data file.
A canonical PCL is a tab-delimited text format used to store microarray datasets. Each row represents a single gene, each column represents a single microarray, and each cell stores the gene's expression value for that condition. There are usually two header rows, one naming each column and one listing the microarray conditions' weights. There are usually three header columns: unique gene identifiers in the first column, human-readable gene name synonyms in the second column, and gene weights in the third. Thus, a simple PCL might be:
GID NAME GWEIGHT Condition 1 Condition 2 Condition 3 EWEIGHT 1 1 1 YKL032C IXR1 1 -0.02 0.03 -0.1 YMR027W HRT2 1 -0.018 0.013 -0.016 YLR220W CCC1 1 -0.0050 -0.01 0.015</pre>
In practice, the format of this file is very flexible: the first row's always a header, the first column's always unique gene IDs, and most other columns are data, but everything else changes. There can be an arbitrary number of feature columns between the gene IDs and the first experimental column (Sleipnir refers to these as "skip" columns). The EWEIGHT row may or may not be present. Missing values might be notated by empty cells or the string NA. CPCL will correctly parse all of these cases.
So in practice, a PCL is a full matrix of data in which each row represents a gene and each column an experiment. Associated with each gene are zero or more features, possibly including the name and weight; associated with each experiment is a name (drawn from the header row). The EWEIGHT row is discarded. More generally, a PCL can be used to hold any data in which vectors of values are associated with a set of genes.
Ways in which a PCL's data entries can be normalized.
Sleipnir::CPCL::CPCL | ( | bool | fHeader = true | ) | [inline] |
bool Sleipnir::CPCL::AddGenes | ( | const std::vector< std::string > & | vecstrGenes | ) |
Appends new, empty gene rows to the end of the PCL using the given gene IDs.
vecstrGenes | Gene names to be appended to the PCL. |
Definition at line 1415 of file pcl.cpp.
References Sleipnir::CFullMatrix< tType >::AddRows(), Sleipnir::CFullMatrix< tType >::GetColumns(), Sleipnir::CMeta::GetNaN(), Sleipnir::CFullMatrix< tType >::GetRows(), Sleipnir::CFullMatrix< tType >::Set(), and SetGene().
Referenced by MedianMultiples().
int Sleipnir::CPCL::Distance | ( | const char * | szFile, |
size_t | iSkip, | ||
const char * | szSimilarityMeasure, | ||
bool | fNormalize, | ||
bool | fZScore, | ||
bool | fAutocorrelate, | ||
const char * | szGeneFile, | ||
float | dCutoff, | ||
size_t | iLimit, | ||
CPCL & | PCL, | ||
CDat & | Dat, | ||
IMeasure::EMap | eMap = IMeasure::EMapCenter , |
||
bool | fFrequencyWeight = false , |
||
float | dAlpha = 0 , |
||
int | nThreads = 1 |
||
) | [static] |
Kitchen sink method for completely loading a PCL and calculating its pairwise similarity scores into a CDat.
szFile | If non-null, file from which PCL is to be loaded; if null, standard input is used. |
iSkip | Number of columns to skip in the PCL file between gene IDs and experimental data. |
szSimilarityMeasure | String identifier of similarity measure to use for CDat generation. |
fNormalize | If true, normalize the generated CDat to the range [0, 1]. |
fZScore | If true, normalize the generated CDat to z-scores (subtract mean, divide by standard deviation). |
fAutocorrelate | If true, autocorrelate the requested similarity measure. |
szGeneFile | If non-null, only convert genes in the given file to pairwise scores in the CDat. |
dCutoff | If finite, remove all pairwise scores less than the given cutoff. |
iLimit | If not equal to -1 and the PCL contains more genes than this limit, do not precalculate pairwise scores; instead, configure the CDat to calculate scores on the fly as needed from the PCL. |
PCL | Output PCL with the loaded data. |
Dat | Output CDat with the calculated pairwise scores. |
eMap | Way in which returned value should be centered (implementation-specific). |
fFrequencyWeight | If true, weight each condition by the frequency with which it is nonzero over all genes. |
The (many) steps performed by this method are as follows:
Definition at line 165 of file pcl.cpp.
References Sleipnir::IMeasure::Clone(), Sleipnir::CDat::ENormalizeMinMax, Sleipnir::CDat::ENormalizeZScore, Sleipnir::CDat::Get(), Get(), GetExperiments(), GetGene(), Sleipnir::CGenes::GetGene(), Sleipnir::CGenes::GetGeneNames(), GetGeneNames(), Sleipnir::CDat::GetGenes(), GetGenes(), Sleipnir::CGenes::GetGenes(), Sleipnir::CGene::GetName(), Sleipnir::IMeasure::GetName(), Sleipnir::CMeta::GetNaN(), Sleipnir::CMeta::IsNaN(), Sleipnir::IMeasure::IsRank(), Sleipnir::IMeasure::Measure(), Sleipnir::CDat::Normalize(), Open(), Sleipnir::CDat::Open(), Sleipnir::CGenes::Open(), RankTransform(), and Sleipnir::CDat::Set().
int Sleipnir::CPCL::Distance | ( | const char * | szFile, |
size_t | iSkip, | ||
const char * | szWeights, | ||
const char * | szSimilarityMeasure, | ||
bool | fNormalize, | ||
bool | fZScore, | ||
bool | fAutocorrelate, | ||
const char * | szGeneFile, | ||
float | dCutoff, | ||
size_t | iLimit, | ||
CPCL & | PCL, | ||
CDat & | Dat, | ||
IMeasure::EMap | eMap = IMeasure::EMapCenter , |
||
bool | fFrequencyWeight = false , |
||
float | dAlpha = 0 , |
||
int | nThreads = 1 |
||
) | [static] |
Kitchen sink method for completely loading a PCL and calculating its pairwise similarity scores into a CDat. This method supports weighted metrics as opposed to the above method which does not. This allows distancer to take a weights vector (currently included in the documentation but it was not previously functioning). The weights vector is used only if freqweight is false.
szFile | If non-null, file from which PCL is to be loaded; if null, standard input is used. |
iSkip | Number of columns to skip in the PCL file between gene IDs and experimental data. |
szWeights | If non-null, file from which weights are to be loaded; if null, this is ignored. |
szSimilarityMeasure | String identifier of similarity measure to use for CDat generation. |
fNormalize | If true, normalize the generated CDat to the range [0, 1]. |
fZScore | If true, normalize the generated CDat to z-scores (subtract mean, divide by standard deviation). |
fAutocorrelate | If true, autocorrelate the requested similarity measure. |
szGeneFile | If non-null, only convert genes in the given file to pairwise scores in the CDat. |
dCutoff | If finite, remove all pairwise scores less than the given cutoff. |
iLimit | If not equal to -1 and the PCL contains more genes than this limit, do not precalculate pairwise scores; instead, configure the CDat to calculate scores on the fly as needed from the PCL. |
PCL | Output PCL with the loaded data. |
Dat | Output CDat with the calculated pairwise scores. |
eMap | Way in which returned value should be centered (implementation-specific). |
fFrequencyWeight | If true, weight each condition by the frequency with which it is nonzero over all genes. |
The (many) steps performed by this method are as follows:
Definition at line 426 of file pcl.cpp.
References Sleipnir::CMeta::c_szWS, Sleipnir::IMeasure::Clone(), Sleipnir::CDat::ENormalizeMinMax, Sleipnir::CDat::ENormalizeZScore, Sleipnir::CDat::Get(), Get(), GetExperiments(), GetGene(), Sleipnir::CGenes::GetGene(), Sleipnir::CGenes::GetGeneNames(), GetGeneNames(), Sleipnir::CDat::GetGenes(), GetGenes(), Sleipnir::CGenes::GetGenes(), Sleipnir::CGene::GetName(), Sleipnir::IMeasure::GetName(), Sleipnir::CMeta::GetNaN(), Sleipnir::CMeta::IsNaN(), Sleipnir::IMeasure::IsRank(), Sleipnir::IMeasure::Measure(), Sleipnir::CDat::Normalize(), Open(), Sleipnir::CDat::Open(), Sleipnir::CGenes::Open(), RankTransform(), Sleipnir::CDat::Set(), and Sleipnir::CMeta::Tokenize().
float& Sleipnir::CPCL::Get | ( | size_t | iGene, |
size_t | iExperiment | ||
) | const [inline] |
Returns the value at the requested PCL position.
iGene | Gene row. |
iExperiment | Experiment column. |
Definition at line 366 of file pcl.h.
References Sleipnir::CFullMatrix< tType >::Get().
Referenced by Distance(), Sleipnir::CBayesNetSmile::Evaluate(), Sleipnir::CPCLSet::Get(), Sleipnir::CDatasetCompact::Open(), Sleipnir::CCoalesce::SetSeed(), and Sleipnir::CCoalesceCluster::Subtract().
float* Sleipnir::CPCL::Get | ( | size_t | iGene | ) | const [inline] |
Return a single gene's row from the PCL.
iGene | PCL row. |
Definition at line 388 of file pcl.h.
References Sleipnir::CFullMatrix< tType >::Get().
const CDataMatrix& Sleipnir::CPCL::Get | ( | ) | const [inline] |
Return the PCL's underlying data matrix.
Definition at line 422 of file pcl.h.
Referenced by Impute(), MedianMultiples(), Normalize(), Randomize(), RankTransform(), SaveBinary(), and SaveGene().
const std::string& Sleipnir::CPCL::GetExperiment | ( | size_t | iExperiment | ) | const [inline] |
Returns the experiment label at the given PCL column.
iExperiment | Index of experiment label to return. |
Definition at line 527 of file pcl.h.
Referenced by Sleipnir::CCoalesce::Cluster(), Sleipnir::CBayesNetSmile::Evaluate(), Sleipnir::CCoalesceCluster::Open(), Sleipnir::CCoalesceCluster::Save(), and SaveBinary().
size_t Sleipnir::CPCL::GetExperiment | ( | const std::string & | strExperiment | ) | const [inline] |
Returns the PCL column of the given experiment label.
strExperiment | Experimental label of index to return. |
size_t Sleipnir::CPCL::GetExperiments | ( | ) | const [inline] |
Return the number of experiment columns in the PCL.
Definition at line 470 of file pcl.h.
Referenced by Sleipnir::CCoalesce::Cluster(), Distance(), Sleipnir::CBayesNetSmile::Evaluate(), Impute(), MedianMultiples(), Normalize(), Sleipnir::CCoalesceCluster::Open(), Sleipnir::CPCLPair::Open(), Open(), Sleipnir::CDatasetCompact::Open(), OpenBinary(), Randomize(), RankTransform(), Sleipnir::CCoalesceCluster::Save(), SaveBinary(), and Sleipnir::CCoalesce::SetSeed().
static const char* Sleipnir::CPCL::GetExtension | ( | ) | [inline, static] |
Returns the standard extension (including period) for PCL files.
Definition at line 152 of file pcl.h.
Referenced by Sleipnir::CCoalesceCluster::Open(), and Sleipnir::CCoalesceCluster::Save().
const std::string& Sleipnir::CPCL::GetFeature | ( | size_t | iFeature | ) | const [inline] |
Return the label (header) of the requested feature.
iFeature | Index of the feature label to retrieve. |
Definition at line 297 of file pcl.h.
Referenced by GetFeature(), and SaveBinary().
const std::string& Sleipnir::CPCL::GetFeature | ( | size_t | iGene, |
size_t | iFeature | ||
) | const [inline] |
Return the requested gene's value for the given feature.
iGene | Gene for which feature value should be returned. |
iFeature | Index of feature value to be retrieved. |
std::string Sleipnir::CPCL::GetFeature | ( | size_t | iGene, |
const char * | szFeature | ||
) | const [inline] |
Return the requested gene's value for the given feature.
iGene | Gene for which feature value should be returned. |
szFeature | Label of feature for which value should be retrieved. |
Definition at line 683 of file pcl.h.
References GetFeature().
size_t Sleipnir::CPCL::GetFeatures | ( | ) | const [inline] |
Return the number of features (skip columns) in the PCL.
Definition at line 279 of file pcl.h.
Referenced by SaveBinary().
const std::string& Sleipnir::CPCL::GetGene | ( | size_t | iGene | ) | const [inline] |
Returns the gene name at the given PCL row.
iGene | Index of gene name to return. |
Definition at line 488 of file pcl.h.
Referenced by Sleipnir::CCoalesce::Cluster(), Distance(), Sleipnir::CBayesNetSmile::Evaluate(), MedianMultiples(), Sleipnir::CPCLSet::Open(), Sleipnir::CCoalesceCluster::Open(), Open(), Sleipnir::CDatasetCompact::Open(), populate(), Sleipnir::CCoalesceCluster::Save(), SaveBinary(), SaveGene(), and SortGenes().
size_t Sleipnir::CPCL::GetGene | ( | const std::string & | strGene | ) | const [inline] |
Return the index of the given gene name, or -1 if it is not included in the PCL.
strGene | Gene name to retrieve. |
const std::vector<std::string>& Sleipnir::CPCL::GetGeneNames | ( | ) | const [inline] |
Returns the vector of gene names associated with this PCL.
Definition at line 452 of file pcl.h.
Referenced by Distance(), Impute(), and Sleipnir::CDatasetCompact::Open().
size_t Sleipnir::CPCL::GetGenes | ( | ) | const [inline] |
Return the number of gene rows in the PCL.
Definition at line 437 of file pcl.h.
Referenced by Sleipnir::CCoalesce::Cluster(), Distance(), Sleipnir::CSVM::Evaluate(), Sleipnir::CBayesNetSmile::Evaluate(), Impute(), Sleipnir::CCoalesceCluster::Initialize(), MedianMultiples(), Normalize(), Sleipnir::CPCLSet::Open(), Sleipnir::CCoalesceCluster::Open(), Open(), Sleipnir::CDatasetCompact::Open(), OpenBinary(), Randomize(), RankTransform(), Sleipnir::CCoalesceCluster::Save(), SaveBinary(), Sleipnir::CCoalesceCluster::SelectGenes(), and SortGenes().
static size_t Sleipnir::CPCL::GetSkip | ( | ) | [inline, static] |
void Sleipnir::CPCL::Impute | ( | size_t | iNeighbors, |
float | dMinimumPresent, | ||
const CDat & | DatSimilarity | ||
) |
Impute missing values using the knnimpute algorithm; optionally mask genes with too many missing values.
iNeighbors | Number of nearest neighbors to use for imputation. |
dMinimumPresent | Fraction of conditions that must be present; genes with fewer than this many values present will be masked instead of imputed. |
DatSimilarity | CDat similarity matrix from which nearest neighbors are determined. |
Imputes missing values in the PCL using the knnimpute algorithm of Troyanskaya et al, Bioinformatics 2001. Briefly, for each gene with missing values, the k nearest neighbors (most similar genes) are found, and the missing value is replaced with a weighted average of the values in the neighbors. Optionally, Impute can also completely mask genes that are missing too many values; genes with less than the requested fraction of data present will be completely masked rather than imputed.
Definition at line 1600 of file pcl.cpp.
References Sleipnir::CFullMatrix< tType >::Get(), Sleipnir::CDat::Get(), Get(), GetExperiments(), GetGenes(), Sleipnir::CMeta::GetNaN(), Sleipnir::CMeta::IsNaN(), MaskGene(), and Set().
Referenced by Impute().
void Sleipnir::CPCL::Impute | ( | size_t | iNeighbors, |
float | dMinimumPresent, | ||
const IMeasure * | pMeasure, | ||
bool | fPrecompute = true |
||
) |
Impute missing values using the knnimpute algorithm; optionally mask genes with too many missing values.
iNeighbors | Number of nearest neighbors to use for imputation. |
dMinimumPresent | Fraction of conditions that must be present; genes with fewer than this many values present will be masked instead of imputed. |
pMeasure | Similarity measure with which nearest neighbors are determined. |
fPrecompute | If true, precompute and cache in memory all pairwise similarities; otherwise, calculate these on the fly from values in the PCL. |
Imputes missing values in the PCL using the knnimpute algorithm of Troyanskaya et al, Bioinformatics 2001. Briefly, for each gene with missing values, the k nearest neighbors (most similar genes) are found, and the missing value is replaced with a weighted average of the values in the neighbors. Optionally, Impute can also completely mask genes that are missing too many values; genes with less than the requested fraction of data present will be completely masked rather than imputed.
Definition at line 1701 of file pcl.cpp.
References Get(), GetExperiments(), GetGeneNames(), Sleipnir::CDat::GetGenes(), Impute(), Sleipnir::IMeasure::Measure(), Sleipnir::CDat::Open(), and Sleipnir::CDat::Set().
bool Sleipnir::CPCL::IsMasked | ( | size_t | iGene | ) | const [inline] |
Return true if the requested gene index is masked.
iGene | Gene row to test for masking. |
Definition at line 616 of file pcl.h.
Referenced by Sleipnir::CSVM::Evaluate(), and SaveBinary().
void Sleipnir::CPCL::MaskGene | ( | size_t | iGene, |
bool | fMask = true |
||
) | [inline] |
Set the mask value for a given gene row.
iGene | Index of gene row to be masked or unmasked. |
fMask | If true, mask given gene; if false, unmask it. |
PCLs allow individual gene rows to be masked. This generally excludes those rows from processing, particularly from any saved output or from use in any machine learning algorithm consuming a PCL.
Definition at line 592 of file pcl.h.
Referenced by Impute(), MedianMultiples(), and Sleipnir::CCoalesceCluster::Save().
void Sleipnir::CPCL::MedianMultiples | ( | size_t | iSample = 100000 , |
size_t | iBins = 40 , |
||
float | dBinSize = 0.25 |
||
) |
Resolves probesets by averaging agreeing probes and discarding conflicting ones.
iSample | Number of random samples to generate for probeset null distribution. |
iBins | Number of bins for probeset distribution histogram. |
dBinSize | Size of bins for probeset distribution histogram in z-score units. |
Definition at line 1733 of file pcl.cpp.
References AddGenes(), Sleipnir::IMeasure::EMapNone, Get(), GetExperiments(), GetGene(), GetGenes(), Sleipnir::CMeta::GetNaN(), Sleipnir::CMeta::IsNaN(), MaskGene(), Sleipnir::CMeasureEuclidean::Measure(), and Set().
void Sleipnir::CPCL::Normalize | ( | ENormalize | eNormalize = ENormalizeRow | ) |
Normalizes the PCL's values in the requested manner.
eNormalize | Algorithm by which the PCL's values should be normalized. |
Definition at line 1452 of file pcl.cpp.
References Sleipnir::CStatistics::Average(), ENormalizeColumn, ENormalizeColumnCenter, ENormalizeColumnFraction, ENormalizeMean, ENormalizeMinMax, ENormalizeRow, ENormalizeZScore, Get(), GetExperiments(), GetGenes(), Sleipnir::CMeta::IsNaN(), Set(), and Sleipnir::CStatistics::Variance().
Referenced by Sleipnir::CCoalesceCluster::Initialize(), and Sleipnir::CPCLSet::Open().
void Sleipnir::CPCL::Open | ( | const CPCL & | PCL | ) |
Create a new PCL by copying the given one.
PCL | PCL to be copied into the current one. |
Definition at line 774 of file pcl.cpp.
References Sleipnir::CFullMatrix< tType >::Get(), Sleipnir::CFullMatrix< tType >::GetColumns(), Sleipnir::CFullMatrix< tType >::GetRows(), Sleipnir::CFullMatrix< tType >::Initialize(), Reset(), and Sleipnir::CFullMatrix< tType >::Set().
Referenced by Sleipnir::CCoalesce::Cluster(), Distance(), Sleipnir::CCoalesceCluster::Initialize(), Sleipnir::CCoalesceCluster::Open(), Sleipnir::CPCLPair::Open(), Open(), Sleipnir::CDatasetCompact::Open(), and Sleipnir::CCoalesceCluster::Save().
void Sleipnir::CPCL::Open | ( | const std::vector< size_t > & | veciGenes, |
const std::vector< std::string > & | vecstrGenes, | ||
const std::vector< std::string > & | vecstrExperiments | ||
) |
Create a new PCL using the given genes, experiments, and gene order.
veciGenes | Order in which genes will be placed in the new PCL. |
vecstrGenes | Gene IDs to be used in the new PCL. |
vecstrExperiments | Experiment labels to be used in the new PCL. |
This creates an empty PCL (all entries missing) with the requested number of gene rows and experiment columns; the given gene IDs and experiment labels are inserted into the appropriate header rows/columns. However, the gene order will be the order of indices given in veciGenes. For example, if veciGenes is [1, 0, 2] and vecstrGenes contains [A, B, C], the order of genes within the new PCL will be [B, A, C]. Experiment order is unaffected and will be as given in vecstrExperiments.
Definition at line 1291 of file pcl.cpp.
References Sleipnir::CFullMatrix< tType >::GetColumns(), GetExperiments(), GetGenes(), Sleipnir::CMeta::GetNaN(), Sleipnir::CFullMatrix< tType >::GetRows(), Sleipnir::CFullMatrix< tType >::Initialize(), Reset(), and Sleipnir::CFullMatrix< tType >::Set().
void Sleipnir::CPCL::Open | ( | const std::vector< std::string > & | vecstrGenes, |
const std::vector< std::string > & | vecstrExperiments, | ||
const std::vector< std::string > & | vecstrFeatures | ||
) |
Create a new PCL using the given genes, experiments, and features.
vecstrGenes | Gene IDs to be used in the new PCL. |
vecstrExperiments | Experiment labels to be used in the new PCL. |
vecstrFeatures | Feature labels to be used in the new PCL (possibly empty). |
This creates an empty PCL (all entries missing) with the requested number of gene rows and experiment columns; the given gene IDs and experiment labels are inserted into the appropriate header rows/columns. The PCL will contain the given number of feature columns between the gene IDs and experimental values, the values for which will be initialized to empty strings.
Definition at line 1236 of file pcl.cpp.
References Sleipnir::CFullMatrix< tType >::GetColumns(), GetExperiments(), GetGenes(), Sleipnir::CMeta::GetNaN(), Sleipnir::CFullMatrix< tType >::GetRows(), Sleipnir::CFullMatrix< tType >::Initialize(), Reset(), Sleipnir::CFullMatrix< tType >::Set(), SetExperiment(), and SetGene().
bool Sleipnir::CPCL::Open | ( | const char * | szFile, |
size_t | iSkip = 2 , |
||
bool | Memmap = false , |
||
bool | rTable = false |
||
) |
Load a PCL from the given text or binary file.
Create a new PCL by reading from text or binary.
szFile | File from which PCL file is loaded. |
iSkip | Number of feature columns to skip between the gene IDs and first experimental column. |
rTable | Is this PCL file generated by R language (This means its missing the "YORF" label in the first line, thus has 1 less token compared to sleipnir PCL format) |
szFile | File from which PCL file is loaded. |
Definition at line 678 of file pcl.cpp.
References Sleipnir::CDat::Get(), Sleipnir::CDat::GetGeneNames(), Sleipnir::CDat::GetGenes(), Sleipnir::CMeta::MapRead(), Open(), Sleipnir::CDat::Open(), OpenBinary(), Reset(), and Set().
bool Sleipnir::CPCL::Open | ( | std::istream & | istm, |
size_t | iSkip, | ||
bool | rTable = false |
||
) |
Load a PCL from the given text stream.
istm | Stream from which PCL file is loaded. |
iSkip | Number of feature columns to skip between the gene IDs and first experimental column. |
rTable | Is this PCL file generated by R language (This means its missing the "YORF" label in the first line, thus has 1 less token compared to sleipnir PCL format) |
Definition at line 829 of file pcl.cpp.
References Sleipnir::CFullMatrix< tType >::GetColumns(), GetExperiments(), GetGene(), GetGenes(), Sleipnir::CFullMatrix< tType >::GetRows(), Sleipnir::CFullMatrix< tType >::Initialize(), and Sleipnir::CFullMatrix< tType >::Set().
bool Sleipnir::CPCL::Open | ( | std::istream & | istm | ) | [inline] |
bool Sleipnir::CPCL::OpenBinary | ( | std::istream & | istm | ) |
Load a PCL from the given binary stream.
istm | Stream from which PCL file is loaded. |
Definition at line 1184 of file pcl.cpp.
References Sleipnir::CFullMatrix< tType >::Get(), GetExperiments(), GetGenes(), Sleipnir::CFullMatrix< tType >::GetRows(), Sleipnir::CFullMatrix< tType >::Initialize(), and Reset().
Referenced by Open().
bool Sleipnir::CPCL::populate | ( | const char * | szFile, |
float | dDefault = HUGE_VAL |
||
) |
Fills a PCL object from a tab text file (gene1 gene2 val) gene1 is row, gene2 is col used for creating a directed gene-gene network
Definition at line 1895 of file pcl.cpp.
References GetGene(), Sleipnir::CMeta::IsNaN(), and Set().
void Sleipnir::CPCL::Randomize | ( | ) | [inline] |
Randomizes the values in each row of the PCL.
Randomly shuffles each gene's vector of values.
Definition at line 707 of file pcl.h.
References Get(), GetExperiments(), and GetGenes().
void Sleipnir::CPCL::RankTransform | ( | ) |
Rank transform each row of the PCL in increasing order.
Replaces all values in the PCL with their increasing ranks by row. For example, a row containing [-0.1, 0.1, 0.3] would be replaced with [0, 1, 2]; a row containing [0.5, 0.4, 0.3] would be replaced with [2, 1, 0].
Definition at line 1369 of file pcl.cpp.
References Get(), GetExperiments(), GetGenes(), Sleipnir::CMeta::IsNaN(), and Set().
Referenced by Distance(), and Sleipnir::CDatasetCompact::Open().
bool Sleipnir::CPCL::Save | ( | const char * | szFile, |
const std::vector< size_t > * | pveciGenes = NULL |
||
) | const |
Save a PCL to the given text stream.
ostm | Stream into which PCL file is saved. |
pveciGenes | If non-null, generate an initial CDT GENE column using the given original gene indices. |
If called without a vector of gene indices, saves a standard PCL to the given text stream using the CPCL's current gene ID list, features, experiments, and data. If a vector of gene indices is provided, the output will instead be in CDT format, with an extra initial column of IDs of the form "GENE###", where the number indicates the gene's original index in the pre-clustered file.
Definition at line 1109 of file pcl.cpp.
References SaveBinary().
Referenced by Sleipnir::CCoalesceCluster::Save(), and Save().
void Sleipnir::CPCL::Save | ( | const char * | szFile = NULL | ) |
Save a PCL to the given text file.
szFile | File into which PCL file is saved. |
Definition at line 748 of file pcl.cpp.
References Save(), and SaveBinary().
void Sleipnir::CPCL::SaveBinary | ( | std::ostream & | ostm | ) | const |
Save a PCL to the given binary stream.
ostm | Stream into which PCL file is saved. |
Definition at line 1147 of file pcl.cpp.
References Get(), GetExperiment(), GetExperiments(), GetFeature(), GetFeatures(), GetGene(), GetGenes(), and IsMasked().
Referenced by Save().
void Sleipnir::CPCL::SaveGene | ( | std::ostream & | ostm, |
size_t | iGene, | ||
size_t | iOriginal = -1 |
||
) | const |
Save a single gene row to the given text stream.
ostm | Stream into which gene row is saved. |
iGene | Gene index to be saved. |
iOriginal | If not equal to -1, generate an initial CDT GENE ID using the given original gene index. |
If called with iGene set to -1, saves a single row of a standard PCL to the given text stream using the CPCL's current gene ID, features, and values for the row. If a gene index is provided, the output will instead be in CDT format, with an extra initial column of IDs of the form "GENE###", where the number indicates the gene's original index in the pre-clustered file.
Definition at line 1070 of file pcl.cpp.
References Get(), GetGene(), and Sleipnir::CMeta::IsNaN().
void Sleipnir::CPCL::SaveHeader | ( | std::ostream & | ostm, |
bool | fCDT = false |
||
) | const |
Save the PCL's header row to the given text stream.
ostm | Stream into which PCL header is saved. |
fCDT | If true, generate an initial CDT GENE column header. |
If called with fCDT false, saves standard PCL header and EWEIGHT rows to the given text stream using the CPCL's current feature and experimental headers. If fCDT is true, the output will instead be in CDT format, with an extra initial column header for gene index identifiers.
void Sleipnir::CPCL::Set | ( | size_t | iGene, |
const float * | adValues | ||
) | [inline] |
Set a single gene's row in the PCL.
iGene | PCL row. |
adValues | Values to which row is set. |
Definition at line 410 of file pcl.h.
References Sleipnir::CFullMatrix< tType >::Set().
Referenced by Sleipnir::CBayesNetSmile::Evaluate(), Impute(), MedianMultiples(), Normalize(), Open(), populate(), and RankTransform().
void Sleipnir::CPCL::Set | ( | size_t | iGene, |
size_t | iExperiment, | ||
float | dValue | ||
) | [inline] |
Set the value at the requested PCL position.
iGene | PCL row. |
iExperiment | PCL column. |
dValue | Value to store. |
Definition at line 641 of file pcl.h.
References Sleipnir::CFullMatrix< tType >::Set().
void Sleipnir::CPCL::SetExperiment | ( | size_t | iExperiment, |
const std::string & | strExperiment | ||
) | [inline] |
Set the experiment label at the given PCL column.
iExperiment | Index of experiment label to set. |
strExperiment | Experiment label to which the requested column is set. |
Definition at line 568 of file pcl.h.
Referenced by Open(), and Sleipnir::CCoalesceCluster::Save().
void Sleipnir::CPCL::SetFeature | ( | size_t | iGene, |
size_t | iFeature, | ||
const std::string & | strValue | ||
) | [inline] |
Set the requested gene's value for the given feature.
iGene | Gene for which feature value should be set. |
iFeature | Index of feature value to be set. |
strValue | Feature value to set for the given gene and feature indices. |
void Sleipnir::CPCL::SetGene | ( | size_t | iGene, |
const std::string & | strGene | ||
) | [inline] |
Set the gene name at the given index.
iGene | Row of gene name to modify. |
strGene | Gene name to store at the requested index. |
Reimplemented from Sleipnir::CPCLImpl.
Definition at line 509 of file pcl.h.
Referenced by AddGenes(), Open(), and Sleipnir::CDat::SetGene().
bool Sleipnir::CPCL::SortGenes | ( | const std::vector< size_t > & | veciOrder | ) |
Reorder the PCL's genes based on the order of the given indices.
veciOrder | Index order in which genes should be placed. |
Reorders the PCL's gene rows based on the given indices. For example, if the current row order is [A, B, C] and the given indices are [1, 0, 2], the new row order will be [B, A, C].
Definition at line 1342 of file pcl.cpp.
References Sleipnir::CFullMatrix< tType >::Get(), GetGene(), GetGenes(), Sleipnir::CFullMatrix< tType >::GetRows(), and Sleipnir::CMeta::Permute().