Sleipnir: src/libsvm.cpp Source File

00001 /*****************************************************************************
00002  * This file is provided under the Creative Commons Attribution 3.0 license.
00003  *
00004  * You are free to share, copy, distribute, transmit, or adapt this work
00005  * PROVIDED THAT you attribute the work to the authors listed below.
00006  * For more information, please see the following web page:
00007  * http://creativecommons.org/licenses/by/3.0/
00008  *
00009  * This file is a component of the Sleipnir library for functional genomics,
00010  * authored by:
00011  * Curtis Huttenhower (chuttenh@princeton.edu)
00012  * Mark Schroeder
00013  * Maria D. Chikina
00014  * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015  *
00016  * If you use this library, the included executable tools, or any related
00017  * code in your work, please cite the following publication:
00018  * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019  * Olga G. Troyanskaya.
00020  * "The Sleipnir library for computational functional genomics"
00021  *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "libsvm.h"
00024 #include "pclset.h"
00025 #include "dataset.h"
00026 #include "meta.h"
00027 #include "genome.h"
00028 #include "compactmatrix.h"
00029 #include <vector>
00030 #include <set>
00031 
00032 #define Malloc(type,n) (type *)malloc((n)*sizeof(type)) 
00033 
00034 //#include <libsvm.h>
00035 
00036 namespace LIBSVM {
00037 
00038 #include "libsvm.h"
00039   struct svm_node* CLIBSVM::x_space = NULL;
00040 
00041   bool CLIBSVM::initialize() {
00042 
00043     /* set default */
00044 
00045     parm.cache_size = 100;
00046     parm.C = 0.01;
00047     parm.eps = 1e-3;
00048     parm.svm_type = C_SVC;
00049     parm.p = 0.1;
00050     parm.shrinking = 1;
00051     parm.nr_weight = 0;
00052     parm.weight_label = NULL;
00053     parm.weight = NULL;
00054     parm.probability = 0;
00055     parm.nu = 0.5;
00056     parm.coef0 = 0;
00057     parm.gamma = 0;
00058     parm.degree = 3;
00059     parm.kernel_type = LINEAR;
00060 
00061     model = NULL;
00062 
00063     balance = 0;//Off
00064 
00065     return true;
00066   }
00067 
00068   bool CLIBSVM::parms_check() {
00069     if (parm.C < 0) {
00070       fprintf(
00071           stderr,
00072           "\nTrade-off between training error and margin is not set (C<0)!\nC value will be set to default value. Clight = Cpef * 100 / n \n");
00073       fprintf(stderr, "be less than 1.0 !!!\n\n");
00074       return false;
00075     }
00076     if (parm.eps <= 0) {
00077       fprintf(stderr,
00078           "\nThe epsilon parameter must be greater than zero!\n\n");
00079       return false;
00080     }
00081 
00082     if (parm.nu < 0 | parm.nu > 1) {
00083       fprintf(stderr, "nu parameter must be between 0 and 1");
00084       return false;
00085     }
00086 
00087     //TODO: add more parameter checks 
00088 
00089     return true;
00090   }
00091 
00092   void CLIBSVM::SetXSpace(Sleipnir::CPCL& PCL) {
00093     size_t j, k, iGene, numFeatures, numLabels;
00094     std::vector<std::string> vecGeneNames;
00095     string GeneName;
00096     float d;
00097 
00098     numFeatures = PCL.GetExperiments();
00099     numLabels = PCL.GetGenes();
00100     vecGeneNames = PCL.GetGeneNames();
00101 
00102     cerr << "total data" << endl;
00103     cerr << "number of features (columns) used: " << numFeatures << endl;
00104     cerr << "number of labels in data: " << numLabels << endl;
00105     cerr << "number of gene (rows) names: " << vecGeneNames.size() << endl;
00106 
00107     x_space = Malloc(struct svm_node, (1+numFeatures) * numLabels);
00108 
00109     j = 0;//element index
00110 
00111     for ( std::vector<std::string>::iterator it = vecGeneNames.begin(); it != vecGeneNames.end(); ++it) {
00112       GeneName = *it;
00113       iGene = PCL.GetGene(GeneName); 
00114 
00115       for ( k = 0; k < numFeatures; k++){
00116         x_space[j].index = k;
00117         if (!Sleipnir::CMeta::IsNaN(d = PCL.Get(iGene, k))) {
00118           x_space[j].value = d;
00119         }else{
00120           // impute 0 for missing values
00121           x_space[j].value = 0;
00122         }
00123         j++;
00124       }
00125 
00126       x_space[j].index = -1;
00127       j++;
00128     }
00129 
00130 
00131   }
00132 
00133   SAMPLE * CLIBSVM::CreateSample(Sleipnir::CPCL& PCL, vector<SVMLabel> SVMLabels) {
00134     size_t i, j, k, s, iGene, iProblem, numFeatures, numLabels, max_index;
00135     float d;
00136 
00137     struct svm_problem* prob;
00138 
00139     prob = Malloc(struct svm_problem,1);
00140 
00141     numFeatures = PCL.GetExperiments();
00142     numLabels = 0;
00143 
00144     iProblem = 0;
00145 
00146     for (i = 0; i < SVMLabels.size(); i++) {
00147       if (!SVMLabels[i].hasIndex){
00148         SVMLabels[i].SetIndex(PCL.GetGene(SVMLabels[i].GeneName));
00149       }
00150       iGene = SVMLabels[i].index;
00151       if (iGene != -1) {
00152         numLabels++;
00153       }
00154     }
00155 
00156     cerr << "sampled data" << endl;
00157     cerr << "number of features used: " << numFeatures << endl;
00158     cerr << "number of labels given: " << SVMLabels.size() << endl;
00159     cerr << "number of labels in data: " << numLabels << endl;
00160 
00161     prob->l = numLabels;
00162     prob->y = Malloc(double,numLabels);
00163     prob->x = Malloc(struct svm_node *, numLabels);
00164 
00165     if(x_space == NULL) {
00166       SetXSpace(PCL);
00167     }
00168 
00169     max_index = numFeatures;
00170 
00171     s = 0;//sample index
00172     for (i = 0; i < SVMLabels.size(); i++) {
00173       iGene = SVMLabels[i].index;
00174 
00175       if (iGene != -1){
00176         (prob->x)[s] = &x_space[iGene*(1+numFeatures)];
00177         (prob->y)[s] = SVMLabels[i].Target;
00178         s++;
00179       }
00180     }
00181 
00182     SAMPLE* pSample = new SAMPLE;
00183 
00184     pSample->n = prob->l;//number of labels
00185     pSample->problems = prob;
00186     pSample->numFeatures = numFeatures;
00187     return pSample;
00188   }
00189 
00190   //TODO: create sample for dab/dat files
00191   //
00192 
00193   vector<Result> CLIBSVM::Classify(Sleipnir::CPCL &PCL,
00194       vector<SVMLabel> SVMLabels) {
00195     size_t i, j, iGene;
00196     double predict_label;
00197     double* dec_values;
00198     double dec_value;
00199     struct svm_node *x;
00200 
00201     SAMPLE* pSample;
00202     vector<Result> vecResult;
00203 
00204     pSample = CLIBSVM::CreateSample(PCL, SVMLabels);
00205 
00206     int svm_type = svm_get_svm_type(model);
00207     int nr_class = svm_get_nr_class(model);
00208 
00209     dec_values = Malloc(double, nr_class*(nr_class-1)/2);
00210     vecResult.resize(pSample->n);
00211 
00212     j= 0; //pSample index
00213     for(i = 0 ; i < SVMLabels.size() ; i++){
00214       if(!SVMLabels[i].hasIndex){//assume createSample sequentially added to pSample TODO: currently hacky
00215         SVMLabels[i].SetIndex(PCL.GetGene(SVMLabels[i].GeneName));
00216       }
00217       iGene = SVMLabels[i].index;
00218 
00219       if (iGene != -1 ) {    
00220         x = pSample->problems->x[j];
00221         predict_label = svm_predict_values(model,x, dec_values);
00222         dec_value = dec_values[0]; //assume that positive class is the first class TODO: currently hacklyi
00223 
00224         vecResult[j].GeneName = SVMLabels[i].GeneName;
00225         vecResult[j].Target = SVMLabels[i].Target;
00226         vecResult[j].Value = dec_value;
00227 
00228         j++;
00229 
00230       }
00231     }
00232 
00233     free(pSample);
00234     //delete pSample ;
00235     free(dec_values);
00236     return vecResult;
00237   }
00238 
00239 
00240   //TODO: classify for dab/dat files
00241   //
00242 
00243 }