Sleipnir
/*****************************************************************************
 * This file is provided under the Creative Commons Attribution 3.0 license.
 *
 * You are free to share, copy, distribute, transmit, or adapt this work
 * PROVIDED THAT you attribute the work to the authors listed below.
 * For more information, please see the following web page:
 * http://creativecommons.org/licenses/by/3.0/
 *
 * This file is a component of the Sleipnir library for functional genomics,
 * authored by:
 *   Curtis Huttenhower (chuttenh@princeton.edu)
 *   Mark Schroeder
 *   Maria D. Chikina
 *   Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
 *
 * If you use this library, the included executable tools, or any related
 * code in your work, please cite the following publication:
 *   Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
 *   Olga G. Troyanskaya.
 *   "The Sleipnir library for computational functional genomics"
 *****************************************************************************/
#include "stdafx.h"
#include "libsvm.h"
#include "pclset.h"
#include "dataset.h"
#include "meta.h"
#include "genome.h"
#include "compactmatrix.h"
#include <vector>
#include <set>

#define Malloc(type,n) (type *)malloc((n)*sizeof(type))

namespace LIBSVM {

struct svm_node* CLIBSVM::x_space = NULL;

bool CLIBSVM::initialize() {
    // Set libsvm defaults: a linear C-SVC with C = 0.01.
    parm.cache_size = 100;
    parm.C = 0.01;
    parm.eps = 1e-3;
    parm.svm_type = C_SVC;
    parm.p = 0.1;
    parm.shrinking = 1;
    parm.nr_weight = 0;
    parm.weight_label = NULL;
    parm.weight = NULL;
    parm.probability = 0;
    parm.nu = 0.5;
    parm.coef0 = 0;
    parm.gamma = 0;
    parm.degree = 3;
    parm.kernel_type = LINEAR;

    model = NULL;

    balance = 0; // off

    return true;
}

bool CLIBSVM::parms_check() {
    if (parm.C < 0) {
        fprintf(stderr,
                "\nTrade-off between training error and margin is not set (C < 0)!\n"
                "C will be set to its default value.\n\n");
        return false;
    }
    if (parm.eps <= 0) {
        fprintf(stderr,
                "\nThe epsilon parameter must be greater than zero!\n\n");
        return false;
    }
    if (parm.nu < 0 || parm.nu > 1) {
        fprintf(stderr, "\nThe nu parameter must be between 0 and 1!\n\n");
        return false;
    }

    // TODO: add more parameter checks

    return true;
}

void CLIBSVM::SetXSpace(Sleipnir::CPCL& PCL) {
    size_t j, k, iGene, numFeatures, numLabels;
    std::vector<std::string> vecGeneNames;
    string GeneName;
    float d;

    numFeatures = PCL.GetExperiments();
    numLabels = PCL.GetGenes();
    vecGeneNames = PCL.GetGeneNames();

    cerr << "total data" << endl;
    cerr << "number of features (columns) used: " << numFeatures << endl;
    cerr << "number of labels in data: " << numLabels << endl;
    cerr << "number of gene (rows) names: " << vecGeneNames.size() << endl;

    // One extra node per gene holds the index = -1 terminator required by libsvm.
    x_space = Malloc(struct svm_node, (1 + numFeatures) * numLabels);

    j = 0; // element index
    for (std::vector<std::string>::iterator it = vecGeneNames.begin();
            it != vecGeneNames.end(); ++it) {
        GeneName = *it;
        iGene = PCL.GetGene(GeneName);

        for (k = 0; k < numFeatures; k++) {
            x_space[j].index = k;
            if (!Sleipnir::CMeta::IsNaN(d = PCL.Get(iGene, k))) {
                x_space[j].value = d;
            } else {
                // Impute 0 for missing values.
                x_space[j].value = 0;
            }
            j++;
        }

        x_space[j].index = -1; // terminator node
        j++;
    }
}

SAMPLE* CLIBSVM::CreateSample(Sleipnir::CPCL& PCL, vector<SVMLabel> SVMLabels) {
    size_t i, s, iGene, numFeatures, numLabels;

    struct svm_problem* prob;
    prob = Malloc(struct svm_problem, 1);

    numFeatures = PCL.GetExperiments();
    numLabels = 0;

    // Count the labeled genes that are actually present in the PCL
    // (GetGene returns -1 for genes that are not found).
    for (i = 0; i < SVMLabels.size(); i++) {
        if (!SVMLabels[i].hasIndex) {
            SVMLabels[i].SetIndex(PCL.GetGene(SVMLabels[i].GeneName));
        }
        iGene = SVMLabels[i].index;
        if (iGene != -1) {
            numLabels++;
        }
    }

    cerr << "sampled data" << endl;
    cerr << "number of features used: " << numFeatures << endl;
    cerr << "number of labels given: " << SVMLabels.size() << endl;
    cerr << "number of labels in data: " << numLabels << endl;

    prob->l = numLabels;
    prob->y = Malloc(double, numLabels);
    prob->x = Malloc(struct svm_node*, numLabels);

    if (x_space == NULL) {
        SetXSpace(PCL);
    }

    s = 0; // sample index
    for (i = 0; i < SVMLabels.size(); i++) {
        iGene = SVMLabels[i].index;

        if (iGene != -1) {
            // Each sample points into the shared x_space block built by SetXSpace.
            (prob->x)[s] = &x_space[iGene * (1 + numFeatures)];
            (prob->y)[s] = SVMLabels[i].Target;
            s++;
        }
    }

    SAMPLE* pSample = new SAMPLE;

    pSample->n = prob->l; // number of labels
    pSample->problems = prob;
    pSample->numFeatures = numFeatures;
    return pSample;
}

// TODO: create sample for dab/dat files

vector<Result> CLIBSVM::Classify(Sleipnir::CPCL& PCL,
        vector<SVMLabel> SVMLabels) {
    size_t i, j, iGene;
    double* dec_values;
    double dec_value;
    struct svm_node* x;

    SAMPLE* pSample;
    vector<Result> vecResult;

    pSample = CLIBSVM::CreateSample(PCL, SVMLabels);

    int nr_class = svm_get_nr_class(model);

    dec_values = Malloc(double, nr_class * (nr_class - 1) / 2);
    vecResult.resize(pSample->n);

    j = 0; // pSample index
    for (i = 0; i < SVMLabels.size(); i++) {
        if (!SVMLabels[i].hasIndex) {
            // Assume CreateSample added samples sequentially. TODO: currently hacky.
            SVMLabels[i].SetIndex(PCL.GetGene(SVMLabels[i].GeneName));
        }
        iGene = SVMLabels[i].index;

        if (iGene != -1) {
            x = pSample->problems->x[j];
            svm_predict_values(model, x, dec_values);
            // Assume the positive class is the first class. TODO: currently hacky.
            dec_value = dec_values[0];

            vecResult[j].GeneName = SVMLabels[i].GeneName;
            vecResult[j].Target = SVMLabels[i].Target;
            vecResult[j].Value = dec_value;

            j++;
        }
    }

    delete pSample; // allocated with new in CreateSample
    free(dec_values);
    return vecResult;
}

// TODO: classify for dab/dat files

}
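
The listing above defines only sample construction (SetXSpace, CreateSample) and prediction (Classify); training is handled elsewhere in CLIBSVM. The following is a minimal, hypothetical sketch of how a caller might wire these pieces together. The SVMLabel (gene, target) constructor, the CPCL::Open arguments, and the Learn training call are assumptions made for illustration and are not confirmed by this file.

// Hypothetical usage sketch; not part of the library source.
// SVMLabel's (gene, target) constructor, CPCL::Open's arguments, and the
// Learn() training step are assumed for illustration only.
#include <iostream>
#include <vector>

#include "libsvm.h"
#include "pcl.h"

int main() {
    Sleipnir::CPCL PCL;
    if (!PCL.Open("expression.pcl", 0))                     // genes x experiments matrix
        return 1;

    std::vector<LIBSVM::SVMLabel> vecLabels;
    vecLabels.push_back(LIBSVM::SVMLabel("YAL001C", 1));    // positive example (assumed ctor)
    vecLabels.push_back(LIBSVM::SVMLabel("YAL002W", -1));   // negative example (assumed ctor)

    LIBSVM::CLIBSVM SVM;
    SVM.initialize();                                       // linear C-SVC, C = 0.01 by default
    if (!SVM.parms_check())
        return 1;

    SVM.Learn(PCL, vecLabels);                              // hypothetical training step, defined outside this file

    std::vector<LIBSVM::Result> vecResults = SVM.Classify(PCL, vecLabels);
    for (size_t i = 0; i < vecResults.size(); ++i)
        std::cout << vecResults[i].GeneName << "\t" << vecResults[i].Value << std::endl;

    return 0;
}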