Sleipnir
src/svmstruct.cpp
/*****************************************************************************
* This file is provided under the Creative Commons Attribution 3.0 license.
*
* You are free to share, copy, distribute, transmit, or adapt this work
* PROVIDED THAT you attribute the work to the authors listed below.
* For more information, please see the following web page:
* http://creativecommons.org/licenses/by/3.0/
*
* This file is a component of the Sleipnir library for functional genomics,
* authored by:
* Curtis Huttenhower (chuttenh@princeton.edu)
* Mark Schroeder
* Maria D. Chikina
* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
*
* If you use this library, the included executable tools, or any related
* code in your work, please cite the following publication:
* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
* Olga G. Troyanskaya.
* "The Sleipnir library for computational functional genomics"
*****************************************************************************/
#include "stdafx.h"
#include "svmstruct.h"
#include "pclset.h"
#include "dataset.h"
#include "meta.h"
#include "genome.h"
#include "compactmatrix.h"
#include <vector>
#include <set>

#define SLACK_RESCALING    1
#define MARGIN_RESCALING   2

namespace SVMArc {
    extern "C" {
        //    void free_struct_model(STRUCTMODEL sm);
        void free_struct_sample(SAMPLE s);
        //    void svm_learn_struct_joint_custom(SAMPLE sample,
        //            STRUCT_LEARN_PARM *sparm,
        //            LEARN_PARM *lparm, KERNEL_PARM *kparm,
        //            STRUCTMODEL *sm);
        //    SAMPLE read_struct_examples_sleipnir(DOC **all_docs, double *all_labels, int example_size, int total_features, STRUCT_LEARN_PARM *sparm);
        //    void free_struct_model(STRUCTMODEL sm);
        //    void free_struct_sample(SAMPLE s);
        //    void set_struct_verbosity(long verb);
        //    double estimate_r_delta_average(DOC **, long, KERNEL_PARM *);
        //    MODEL *read_model(char *);
        LABEL classify_struct_example(PATTERN x, STRUCTMODEL *sm,
            STRUCT_LEARN_PARM *sparm);
        DOC* create_example(long, long, long, double, SVECTOR *);
        SVECTOR* create_svector(WORD *, char *, double);
        void set_struct_verbosity(long verb);
    }

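    // Forwards the requested verbosity level to the underlying SVM^struct code.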
    void CSVMSTRUCTMC::SetVerbosity(size_t V) {
        struct_verbosity = (long) V;
    }

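    // Populates struct_parm, learn_parm, and kernel_parm with defaults
    // (linear kernel, C = 0.01, L1 slack norm); these largely mirror the
    // command-line defaults of SVM^struct's own main program.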
    bool CSVMSTRUCTMC::initialize() {

        /* set defaults */
        Alg = DEFAULT_ALG_TYPE;
        //struct_parm defaults
        struct_parm.C = 0.01;
        struct_parm.slack_norm = 1;
        struct_parm.epsilon = DEFAULT_EPS;
        struct_parm.custom_argc = 0;
        struct_parm.loss_function = DEFAULT_LOSS_FCT;
        struct_parm.loss_type = DEFAULT_RESCALING;
        struct_parm.newconstretrain = 100;
        struct_parm.ccache_size = 5;
        struct_parm.batch_size = 100;
        //learn_parm defaults
        //strcpy(learn_parm.predfile, "trans_predictions");
        strcpy(learn_parm.alphafile, "");
        //verbosity = 0;          /* verbosity for svm_light */
        //struct_verbosity = 1;   /* verbosity for struct learning portion */
        learn_parm.biased_hyperplane = 1;
        learn_parm.remove_inconsistent = 0;
        learn_parm.skip_final_opt_check = 0;
        learn_parm.svm_maxqpsize = 10;
        learn_parm.svm_newvarsinqp = 0;
        learn_parm.svm_iter_to_shrink = -9999;
        learn_parm.maxiter = 100000;
        learn_parm.kernel_cache_size = 40;
        learn_parm.svm_c = 99999999;  /* overridden by struct_parm.C */
        learn_parm.eps = 0.001;       /* overridden by struct_parm.epsilon */
        learn_parm.transduction_posratio = -1.0;
        learn_parm.svm_costratio = 1.0;
        learn_parm.svm_costratio_unlab = 1.0;
        learn_parm.svm_unlabbound = 1E-5;
        learn_parm.epsilon_crit = 0.001;
        learn_parm.epsilon_a = 1E-10;  /* changed from 1e-15 */
        learn_parm.compute_loo = 0;
        learn_parm.rho = 1.0;
        learn_parm.xa_depth = 0;
        kernel_parm.kernel_type = 0;
        kernel_parm.poly_degree = 3;
        kernel_parm.rbf_gamma = 1.0;
        kernel_parm.coef_lin = 1;
        kernel_parm.coef_const = 1;
        strcpy(kernel_parm.custom, "empty");

        if (learn_parm.svm_iter_to_shrink == -9999) {
            learn_parm.svm_iter_to_shrink = 100;
        }

        if ((learn_parm.skip_final_opt_check)
            && (kernel_parm.kernel_type == LINEAR)) {
            printf("\nIt does not make sense to skip the final optimality check for linear kernels.\n\n");
            learn_parm.skip_final_opt_check = 0;
        }

        //struct parms

        /* set number of features to -1, indicating that it will be computed
           in init_struct_model() */
        struct_parm.num_features = -1;

        return true;
    }

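    // Sanity-checks the parameter values set in initialize(); prints a
    // diagnostic and returns false for any setting outside its valid range.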
    bool CSVMSTRUCTMC::parms_check() {
        if ((learn_parm.skip_final_opt_check) && (learn_parm.remove_inconsistent)) {
            fprintf(stderr,
                "\nIt is necessary to do the final optimality check when removing inconsistent \nexamples.\n");
            return false;
        }
        if (learn_parm.svm_maxqpsize < 2) {
            fprintf(stderr,
                "\nMaximum size of QP-subproblems not in valid range: %ld [2..]\n",
                learn_parm.svm_maxqpsize);
            return false;
        }
        if (learn_parm.svm_maxqpsize < learn_parm.svm_newvarsinqp) {
            fprintf(stderr,
                "\nMaximum size of QP-subproblems [%ld] must be larger than the number of\n",
                learn_parm.svm_maxqpsize);
            fprintf(stderr,
                "new variables [%ld] entering the working set in each iteration.\n",
                learn_parm.svm_newvarsinqp);
            return false;
        }
        if (learn_parm.svm_iter_to_shrink < 1) {
            fprintf(stderr,
                "\nMaximum number of iterations for shrinking not in valid range: %ld [1,..]\n",
                learn_parm.svm_iter_to_shrink);
            return false;
        }
        if (struct_parm.C < 0) {
            fprintf(stderr,
                "\nTrade-off between training error and margin is not set (C<0)!\nC value will be set to default value. Clight = Cpef * 100 / n \n");
        }
        if (learn_parm.transduction_posratio > 1) {
            fprintf(stderr,
                "\nThe fraction of unlabeled examples to classify as positives must\n");
            fprintf(stderr, "be less than 1.0 !!!\n\n");
            return false;
        }
        if (learn_parm.svm_costratio <= 0) {
            fprintf(stderr,
                "\nThe COSTRATIO parameter must be greater than zero!\n\n");
            return false;
        }
        if (struct_parm.epsilon <= 0) {
            fprintf(stderr,
                "\nThe epsilon parameter must be greater than zero!\n\n");
            return false;
        }
        if ((struct_parm.slack_norm < 1) || (struct_parm.slack_norm > 2)) {
            fprintf(stderr,
                "\nThe norm of the slacks must be either 1 (L1-norm) or 2 (L2-norm)!\n\n");
            return false;
        }
        if ((struct_parm.loss_type != SLACK_RESCALING)
            && (struct_parm.loss_type != MARGIN_RESCALING)) {
            fprintf(stderr,
                "\nThe loss type must be either 1 (slack rescaling) or 2 (margin rescaling)!\n\n");
            return false;
        }
        if (learn_parm.rho < 0) {
            fprintf(stderr,
                "\nThe parameter rho for xi/alpha-estimates and leave-one-out pruning must\n");
            fprintf(stderr,
                "be greater than zero (typically 1.0 or 2.0, see T. Joachims, Estimating the\n");
            fprintf(stderr,
                "Generalization Performance of an SVM Efficiently, ICML, 2000.)!\n\n");
            return false;
        }
        if ((learn_parm.xa_depth < 0) || (learn_parm.xa_depth > 100)) {
            fprintf(stderr,
                "\nThe parameter depth for ext. xi/alpha-estimates must be in [0..100] (zero\n");
            fprintf(stderr,
                "for switching to the conventional xa/estimates described in T. Joachims,\n");
            fprintf(stderr,
                "Estimating the Generalization Performance of an SVM Efficiently, ICML, 2000.)\n");
            return false;
        }

        return true;
    }

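    // Builds an SVM-light DOC for a single gene: one WORD per experiment in
    // the PCL, with 1-based feature indices, NaN expression values mapped to
    // 0, and the WORD array terminated by a sentinel with wnum = 0, as
    // SVM-light requires.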
    DOC* CSVMSTRUCTMC::CreateDoc(Sleipnir::CPCL &PCL, size_t iGene, size_t iDoc) {
        WORD* aWords;
        size_t i, iWords;
        float d;
        DOC* pRet;
        //get number of features
        iWords = PCL.GetExperiments();
        //cerr<<"Newing WORDS "<<(iWords+1)*sizeof(WORD)<<endl;
        aWords = new WORD[iWords + 1];
        //set the words
        for (i = 0; i < iWords; ++i) {
            aWords[i].wnum = i + 1;
            if (!Sleipnir::CMeta::IsNaN(d = PCL.Get(iGene, i)))
                aWords[i].weight = d;
            else
                aWords[i].weight = 0;
        }
        aWords[i].wnum = 0;
        //cerr<<"START Create Example"<<endl;
        pRet = create_example(iDoc, 0, 0, 1, create_svector(aWords, "", 1));
        //cerr<<"END create example"<<endl;
        delete[] aWords;
        return pRet;
    }

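    // Reads a two-column label file (gene name, integer class label), one
    // gene per line; blank lines are ignored and malformed lines are skipped
    // with a warning on stderr.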
    vector<SVMLabel> CSVMSTRUCTMC::ReadLabels(ifstream & ifsm) {
        static const size_t c_iBuffer = 1024;
        char acBuffer[c_iBuffer];
        vector<string> vecstrTokens;
        vector<SVMLabel> vecLabels;
        size_t numPositives, numNegatives;
        numPositives = numNegatives = 0;
        while (!ifsm.eof()) {
            ifsm.getline(acBuffer, c_iBuffer - 1);
            acBuffer[c_iBuffer - 1] = 0;
            vecstrTokens.clear();
            CMeta::Tokenize(acBuffer, vecstrTokens);
            if (vecstrTokens.empty())
                continue;
            if (vecstrTokens.size() != 2) {
                cerr << "Illegal label line (" << vecstrTokens.size() << "): "
                    << acBuffer << endl;
                continue;
            }
            vecLabels.push_back(SVMArc::SVMLabel(vecstrTokens[0], atoi(
                vecstrTokens[1].c_str())));
            if (vecLabels.back().Target > 0)
                numPositives++;
            else
                numNegatives++;
        }
        return vecLabels;
    }

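    // Assembles a SAMPLE for structural SVM training: genes not found in the
    // PCL are dropped, every class label must be a positive integer, and
    // num_classes is taken as the largest label observed.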
    SAMPLE* CSVMSTRUCTMC::CreateSample(Sleipnir::CPCL &PCL, vector<SVMLabel> SVMLabels) {
        size_t i, iGene, iDoc;
        int n;               /* number of examples */
        int *target;
        long num_classes = 0;
        SAMPLE* pSample = new SAMPLE;
        EXAMPLE* examples;
        DOC** docs;
        vector<DOC*> vec_pDoc;
        vec_pDoc.reserve(SVMLabels.size());
        vector<int> vecClass;
        vecClass.reserve(SVMLabels.size());

        iDoc = 0;
        for (i = 0; i < SVMLabels.size(); i++) {
            //cout << "processing gene " << SVMLabels[i].GeneName << endl;
            if (!SVMLabels[i].hasIndex) {
                SVMLabels[i].SetIndex(PCL.GetGene(SVMLabels[i].GeneName));
            }
            iGene = SVMLabels[i].index;
            //cout << SVMLabels[i].GeneName << " gene at location " << iGene << endl;
            if (iGene != -1) {
                //cout << "creating doc" << endl;
                iDoc++;
                vec_pDoc.push_back(CreateDoc(PCL, iGene, iDoc - 1));
                vecClass.push_back(SVMLabels[i].Target);
            }
        }

        //copy patterns and labels to new arrays
        docs = new DOC*[vec_pDoc.size()];
        n = vec_pDoc.size();
        //cout << "Read in " << n << " Standards" << endl;
        copy(vec_pDoc.begin(), vec_pDoc.end(), docs);
        vec_pDoc.clear();

        target = new int[vecClass.size()];
        copy(vecClass.begin(), vecClass.end(), target);
        vecClass.clear();

        examples = (EXAMPLE *) my_malloc(sizeof(EXAMPLE) * n);
        for (i = 0; i < n; i++)     /* find highest class label */
            if (num_classes < target[i])
                num_classes = target[i];

        for (i = 0; i < n; i++)     /* make sure all class labels are positive */
            if (target[i] < 1) {
                printf("\nERROR: The class label '%d' of example number %ld must be a positive integer (>= 1)!\n",
                    target[i], (long) (i + 1));
                exit(1);
            }

        for (i = 0; i < n; i++) {   /* copy docs over into new datastructure */
            examples[i].x.doc = docs[i];
            examples[i].y.Class = target[i];
            examples[i].y.scores = NULL;
            examples[i].y.num_classes = num_classes;
        }
        /* target and docs were allocated with new[], so release them with
           delete[] (the original code mismatched them with free()) */
        delete[] target;
        delete[] docs;
        pSample->n = n;
        pSample->examples = examples;

        if (struct_verbosity >= 0)
            printf(" (%d examples) ", pSample->n);

        return pSample;
    }


    //Single-gene classification

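    // Classifies each labeled gene individually with the trained structural
    // model, recording the predicted class and the per-class scores (the
    // scores array in SVM^struct's LABEL is 1-indexed, hence the
    // k = 1..num_classes loop below).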
    vector<Result> CSVMSTRUCTMC::Classify(Sleipnir::CPCL &PCL,
        vector<SVMLabel> SVMLabels) {
        size_t i, k, iGene, iDoc;
        vector<int> vecClass;
        vector<Result> vecResult;
        iDoc = 0;
        PATTERN pattern;
        pattern.totdoc = 1;
        cerr << "CLASSIFY classifying " << endl;
        LABEL label;
        for (i = 0; i < SVMLabels.size(); i++) {
            if (!SVMLabels[i].hasIndex) {
                SVMLabels[i].SetIndex(PCL.GetGene(SVMLabels[i].GeneName));
            }
            iGene = SVMLabels[i].index;
            //cout << "CLASS gene=" << iGene << endl;
            if (iGene != -1) {
                iDoc++;
                //cout << "CLASS iDOC=" << iDoc << endl;
                pattern.doc = CreateDoc(PCL, iGene, iDoc);
                //cerr<<"Doc Created"<<endl;
                label = classify_struct_example(pattern, &structmodel,
                    &struct_parm);
                //cerr<<"CLASSIFIED"<<endl;
                vecClass.push_back(SVMLabels[i].Target);
                vecResult.resize(iDoc);
                vecResult[iDoc - 1].GeneName = SVMLabels[i].GeneName;
                vecResult[iDoc - 1].Target = SVMLabels[i].Target;
                vecResult[iDoc - 1].Value = label.Class;
                vecResult[iDoc - 1].num_class = struct_parm.num_classes;
                //vecResult[iDoc - 1].Scores.reserve(label.num_classes);
                for (k = 1; k <= struct_parm.num_classes; k++)
                    vecResult[iDoc - 1].Scores.push_back(label.scores[k]);
                //cerr<<"CLASSIFY Called FreeDoc"<<endl;
                FreeDoc(pattern.doc);
                //cerr<<"CLASSIFY End FreeDoc"<<endl;
            }
        }

        return vecResult;
    }

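    // Releases a SAMPLE's EXAMPLE array and the DOC structs it holds. Only
    // the DOC shells are freed; the feature vectors they point to are left
    // allocated (presumably the intent of the "_leave_Doc" suffix), so any
    // code still sharing those vectors remains valid.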
    void CSVMSTRUCTMC::FreeSample_leave_Doc(SAMPLE s) {
        /* Frees the memory of sample s. */
        int i;
        for (i = 0; i < s.n; i++) {
            free(s.examples[i].x.doc);
            free_label(s.examples[i].y);
        }
        free(s.examples);
    }
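
    /* A minimal usage sketch (hypothetical driver code; training itself is
     * done via svm_struct's learning routines, which this file only declares
     * or comments out):
     *
     *   CSVMSTRUCTMC svm;
     *   svm.SetVerbosity(1);
     *   svm.initialize();
     *   if (!svm.parms_check())
     *       return 1;
     *   std::ifstream ifsm("labels.txt");   // two columns: gene, class label
     *   std::vector<SVMLabel> vecLabels = svm.ReadLabels(ifsm);
     *   SAMPLE* pSample = svm.CreateSample(PCL, vecLabels);
     *   // ... train the structural model (not shown in this file) ...
     *   std::vector<Result> vecResults = svm.Classify(PCL, vecLabels);
     *   svm.FreeSample_leave_Doc(*pSample);
     */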

} // namespace SVMArc