Sleipnir
src/svmperf.cpp
00001 /*****************************************************************************
00002  * This file is provided under the Creative Commons Attribution 3.0 license.
00003  *
00004  * You are free to share, copy, distribute, transmit, or adapt this work
00005  * PROVIDED THAT you attribute the work to the authors listed below.
00006  * For more information, please see the following web page:
00007  * http://creativecommons.org/licenses/by/3.0/
00008  *
00009  * This file is a component of the Sleipnir library for functional genomics,
00010  * authored by:
00011  * Curtis Huttenhower (chuttenh@princeton.edu)
00012  * Mark Schroeder
00013  * Maria D. Chikina
00014  * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015  *
00016  * If you use this library, the included executable tools, or any related
00017  * code in your work, please cite the following publication:
00018  * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019  * Olga G. Troyanskaya.
00020  * "The Sleipnir library for computational functional genomics"
00021  *****************************************************************************/
#include "stdafx.h"
#include "svmperf.h"
#include "pclset.h"
#include "dataset.h"
#include "meta.h"
#include "genome.h"
#include "compactmatrix.h"
#include <cstdlib>
#include <set>
#include <vector>
00031 
00032 #define  SLACK_RESCALING    1
00033 #define  MARGIN_RESCALING   2
00034 
00035 namespace SVMLight {
00036 #include "svmperf.h"
00037 extern "C" {
00038 //    void free_struct_model(STRUCTMODEL sm);
00039 void free_struct_sample(SAMPLE s);
00040 //    void svm_learn_struct_joint_custom(SAMPLE sample,
00041 //            STRUCT_LEARN_PARM *sparm,
00042 //            LEARN_PARM *lparm, KERNEL_PARM *kparm,
00043 //            STRUCTMODEL *sm);
00044 //    SAMPLE read_struct_examples_sleipnir(DOC **all_docs, double*all_labels, int example_size, int total_features, STRUCT_LEARN_PARM *sparm);
00045 //    void free_struct_model(STRUCTMODEL sm);
00046 //    void free_struct_sample(SAMPLE s);
00047 //    void set_struct_verbosity(long verb);
00048 //    double estimate_r_delta_average(DOC **, long, KERNEL_PARM *);
00049 //    MODEL *read_model(char *);
00050 LABEL classify_struct_example(PATTERN x, STRUCTMODEL *sm,
00051         STRUCT_LEARN_PARM *sparm);
00052 DOC* create_example(long, long, long, double, SVECTOR *);
00053 SVECTOR * create_svector(WORD *, char *, double);
00054 void set_struct_verbosity(long verb);
00055 
00056 }
00057 
00058 void CSVMPERF::SetVerbosity(size_t V) {
00059     struct_verbosity = (long) V;
00060 }
00061 
00062 bool CSVMPERF::initialize() {
00063 
00064     //set directionality
00065 
00066 
00067     /* set default */
00068 
00069     //Learn_parms
00070     struct_parm.C = 0.01;
00071     struct_parm.slack_norm = 1;
00072     struct_parm.epsilon = DEFAULT_EPS;
00073     struct_parm.custom_argc = 0;
00074     struct_parm.loss_function = 4;
00075     struct_parm.loss_type = DEFAULT_RESCALING;
00076     struct_parm.newconstretrain = 100;
00077     struct_parm.ccache_size = 5;
00078     struct_parm.batch_size = 100;
00079     struct_parm.bias_featurenum = 0;
00080 
00081     //Learn_parms
00082     //strcpy (learn_parm.predfile, "trans_predictions");
00083     strcpy(learn_parm.alphafile, "");
00084     //verbosity=0;/*verbosity for svm_light*/
00085     //struct_verbosity = 1; /*verbosity for struct learning portion*/
00086     learn_parm.biased_hyperplane = 1;
00087     learn_parm.remove_inconsistent = 0;
00088     learn_parm.skip_final_opt_check = 0;
00089     learn_parm.svm_maxqpsize = 10;
00090     learn_parm.svm_newvarsinqp = 0;
00091     learn_parm.svm_iter_to_shrink = -9999;
00092     learn_parm.maxiter = 1000;
00093     learn_parm.kernel_cache_size = 40;
00094     learn_parm.svm_c = 99999999; /* overridden by struct_parm->C */
00095     learn_parm.eps = 0.001; /* overridden by struct_parm->epsilon */
00096     learn_parm.transduction_posratio = -1.0;
00097     learn_parm.svm_costratio = 1.0;
00098     learn_parm.svm_costratio_unlab = 1.0;
00099     learn_parm.svm_unlabbound = 1E-5;
00100     learn_parm.epsilon_crit = 0.001;
00101     learn_parm.epsilon_a = 1E-15; /* changed from 1e-15 */
00102     learn_parm.compute_loo = 0;
00103     learn_parm.rho = 1.0;
00104     learn_parm.xa_depth = 0;
00105 
00106     //kernel parms
00107     kernel_parm.kernel_type = 0;
00108     kernel_parm.poly_degree = 3;
00109     kernel_parm.rbf_gamma = 1.0;
00110     kernel_parm.coef_lin = 1;
00111     kernel_parm.coef_const = 1;
00112     strcpy(kernel_parm.custom, "empty");
00113 
00114     if (learn_parm.svm_iter_to_shrink == -9999) {
00115         learn_parm.svm_iter_to_shrink = 100;
00116     }
00117 
00118     if ((learn_parm.skip_final_opt_check)
00119             && (kernel_parm.kernel_type == LINEAR)) {
00120         printf(
00121                 "\nIt does not make sense to skip the final optimality check for linear kernels.\n\n");
00122         learn_parm.skip_final_opt_check = 0;
00123     }
00124 
00125     //struct parms
00126     struct_parm.bias = 0;
00127     struct_parm.prec_rec_k_frac = 0.5;
00128     struct_parm.sparse_kernel_type = LINEAR;
00129     struct_parm.sparse_kernel_size = 500;
00130     struct_parm.shrinking = 1;
00131     strcpy(struct_parm.sparse_kernel_file, "");
00132     /* set number of features to -1, indicating that it will be computed
00133      in init_struct_model() */
00134     struct_parm.num_features = -1;
00135 
00136     return true;
00137 }
00138 
00139 bool CSVMPERF::parms_check() {
00140     if ((learn_parm.skip_final_opt_check) && (learn_parm.remove_inconsistent)) {
00141         fprintf(
00142                 stderr,
00143                 "\nIt is necessary to do the final optimality check when removing inconsistent \nexamples.\n");
00144         return false;
00145     }
00146     if ((learn_parm.svm_maxqpsize < 2)) {
00147         fprintf(
00148                 stderr,
00149                 "\nMaximum size of QP-subproblems not in valid range: %ld [2..]\n",
00150                 learn_parm.svm_maxqpsize);
00151         return false;
00152     }
00153     if ((learn_parm.svm_maxqpsize < learn_parm.svm_newvarsinqp)) {
00154         fprintf(
00155                 stderr,
00156                 "\nMaximum size of QP-subproblems [%ld] must be larger than the number of\n",
00157                 learn_parm.svm_maxqpsize);
00158         fprintf(
00159                 stderr,
00160                 "new variables [%ld] entering the working set in each iteration.\n",
00161                 learn_parm.svm_newvarsinqp);
00162         return false;
00163     }
00164     if (learn_parm.svm_iter_to_shrink < 1) {
00165         fprintf(
00166                 stderr,
00167                 "\nMaximum number of iterations for shrinking not in valid range: %ld [1,..]\n",
00168                 learn_parm.svm_iter_to_shrink);
00169         return false;
00170     }
00171     if (struct_parm.C < 0) {
00172         fprintf(
00173                 stderr,
00174                 "\nTrade-off between training error and margin is not set (C<0)!\nC value will be set to default value. Clight = Cpef * 100 / n \n");
00175     }
00176     if (learn_parm.transduction_posratio > 1) {
00177         fprintf(stderr,
00178                 "\nThe fraction of unlabeled examples to classify as positives must\n");
00179         fprintf(stderr, "be less than 1.0 !!!\n\n");
00180         return false;
00181     }
00182     if (learn_parm.svm_costratio <= 0) {
00183         fprintf(stderr,
00184                 "\nThe COSTRATIO parameter must be greater than zero!\n\n");
00185         return false;
00186     }
00187     if (struct_parm.epsilon <= 0) {
00188         fprintf(stderr,
00189                 "\nThe epsilon parameter must be greater than zero!\n\n");
00190         return false;
00191     }
00192     if ((struct_parm.slack_norm < 1) || (struct_parm.slack_norm > 2)) {
00193         fprintf(stderr,
00194                 "\nThe norm of the slacks must be either 1 (L1-norm) or 2 (L2-norm)!\n\n");
00195         return false;
00196     }
00197 
00198     if ((struct_parm.loss_type != SLACK_RESCALING) && (struct_parm.loss_type
00199             != MARGIN_RESCALING)) {
00200         fprintf(
00201                 stderr,
00202                 "\nThe loss type must be either 1 (slack rescaling) or 2 (margin rescaling)!\n\n");
00203         return false;
00204     }
00205 
00206     if (learn_parm.rho < 0) {
00207         fprintf(stderr,
00208                 "\nThe parameter rho for xi/alpha-estimates and leave-one-out pruning must\n");
00209         fprintf(stderr,
00210                 "be greater than zero (typically 1.0 or 2.0, see T. Joachims, Estimating the\n");
00211         fprintf(stderr,
00212                 "Generalization Performance of an SVM Efficiently, ICML, 2000.)!\n\n");
00213         return false;
00214     }
00215     if ((learn_parm.xa_depth < 0) || (learn_parm.xa_depth > 100)) {
00216         fprintf(stderr,
00217                 "\nThe parameter depth for ext. xi/alpha-estimates must be in [0..100] (zero\n");
00218         fprintf(stderr,
00219                 "for switching to the conventional xa/estimates described in T. Joachims,\n");
00220         fprintf(
00221                 stderr,
00222                 "Estimating the Generalization Performance of an SVM Efficiently, ICML, 2000.)\n");
00223         return false;
00224     }
00225 
00226     /* Note that the validity of the value for struct_parm->prec_rec_k_frac in
00227      relation to #pos is checked in read_struct_examples() */
00228     if (struct_parm.prec_rec_k_frac < 0) {
00229         fprintf(stderr,
00230                 "\nThe value of option --k must be greater then zero!\n\n");
00231         return false;
00232     }
00233 
00234     return true;
00235 }
00236 
00237 DOC* CSVMPERF::CreateDoc(Sleipnir::CPCLSet &PCLSet, size_t iGene, size_t iDoc) {
00238     WORD* aWords;
00239     size_t i, j, iWord, iWords, iPCL, iExp;
00240     float d;
00241     DOC* pRet;
00242     pRet->fvec->words[0].weight;
00243     //get number of features
00244     iWords = 0;
00245     for (i = 0; i < PCLSet.GetPCLs(); i++) {
00246         //    cout<<"CD:PCLSET= "<<i<<endl;
00247         //  cout<<"CD:numExp= "<<PCLSet.Get(i).GetExperiments()<<endl;
00248         iWords += PCLSet.Get(i).GetExperiments();
00249     }
00250     //      cout << "CD:iwords=" << iWords << endl;
00251     aWords = new WORD[iWords + 1];
00252     //number the words
00253     for (i = 0; i < iWords; ++i) {
00254         //   cout<<i<<endl;
00255         aWords[i].wnum = i + 1;
00256         // asWords[ i ].wnum = 0;
00257     }
00258     aWords[i].wnum = 0;
00259     //get the values;
00260     iWord = 0;
00261     for (i = 0; i < PCLSet.GetPCLs(); i++) {
00262         iExp = PCLSet.Get(i).GetExperiments();
00263         for (j = 0; j < iExp; j++) {
00264             //     cout<<"CD:iWord="<<iWord<<endl;
00265             if (!Sleipnir::CMeta::IsNaN(d = PCLSet.Get(i, iGene, j))) {
00266                 //   if (i==0 && j==0)
00267                 //       cout<<"First value is "<<d<<endl;
00268                 aWords[iWord].weight = d;
00269             } else
00270                 aWords[iWord].weight = 0;
00271             iWord++;
00272         }
00273     }
00274     pRet = create_example(iDoc, 0, 0, 1, create_svector(aWords, "", 1));
00275     delete[] aWords;
00276     // cout<<"done creating DOC"<<endl;
00277     return pRet;
00278 }
00279 //For single genes usign single PCL
00280 
00281 DOC* CSVMPERF::CreateDoc(Sleipnir::CPCL &PCL, size_t iGene, size_t iDoc) {
00282     WORD* aWords;
00283     size_t i, j, iWord, iWords, iPCL, iExp;
00284     float d;
00285     DOC* pRet;
00286     pRet->fvec->words[0].weight;
00287     //get number of features
00288     iWords = PCL.GetExperiments();
00289     //cerr<<"Newing WORDS "<<(iWords+1)*sizeof(WORD)<<endl;
00290     aWords = new WORD[iWords + 1];
00291     //set the words
00292     for (i = 0; i < iWords; ++i) {
00293         aWords[i].wnum = i + 1;
00294         if (!Sleipnir::CMeta::IsNaN(d = PCL.Get(iGene, i)))
00295             aWords[i].weight = d;
00296         else
00297             aWords[i].weight = 0;
00298     }
00299     aWords[i].wnum = 0;
00300     // cerr<<"START Create Example"<<endl;
00301     pRet = create_example(iDoc, 0, 0, 1, create_svector(aWords, "", 1));
00302     //cerr<<"END create example"<<endl;
00303     delete[] aWords;
00304     return pRet;
00305 }
00306 //Single Gene using a Dat for data
00307 
00308 DOC* CSVMPERF::CreateDoc(Sleipnir::CDat& Dat, size_t iGene, size_t iDoc) {
00309     WORD* aWords;
00310     size_t i, j, iWord, iWords;
00311     float d;
00312     DOC* pRet;
00313     pRet->fvec->words[0].weight;
00314     //get number of features
00315     iWords = Dat.GetGenes();
00316     //      cout << "CD:iwords=" << iWords << endl;
00317     aWords = new WORD[iWords + 1];
00318     //number the words
00319     for (i = 0; i < iWords; ++i) {
00320         //   cout<<i<<endl;
00321         aWords[i].wnum = i + 1;
00322         // asWords[ i ].wnum = 0;
00323     }
00324     aWords[i].wnum = 0;
00325     //get the values;
00326     iWord = 0;
00327     for (i = 0; i < Dat.GetGenes(); i++) {
00328         if (!Sleipnir::CMeta::IsNaN(d = Dat.Get(iGene, i))) {
00329             //   if (i==0 && j==0)
00330             //       cout<<"First value is "<<d<<endl;
00331             aWords[iWord].weight = d;
00332         } else
00333             aWords[iWord].weight = 0;
00334         iWord++;
00335     }
00336     pRet = create_example(iDoc, 0, 0, 1, create_svector(aWords, "", 1));
00337     delete[] aWords;
00338     // cout<<"done creating DOC"<<endl;
00339     return pRet;
00340 }
00341 
00342 //Create Sample functions for single genes
00343 
00344 SAMPLE* CSVMPERF::CreateSample(Sleipnir::CPCLSet &PCLSet,
00345         vector<SVMLabel> SVMLabels) {
00346     size_t i, j, iGene, iDoc;
00347     vector<DOC*> vec_pDoc;
00348     vector<double> vecClass;
00349     vector<size_t> veciGene;
00350     iDoc = 0;
00351     float numPositives, numNegatives;
00352     numPositives = numNegatives = 0;
00353     for (i = 0; i < SVMLabels.size(); i++) {
00354         //     cout<< "processing gene " << SVMLabels[i].GeneName << endl;
00355         if (!SVMLabels[i].hasIndex) {
00356             SVMLabels[i].SetIndex(PCLSet.GetGene(SVMLabels[i].GeneName));
00357         }
00358         iGene = SVMLabels[i].index;
00359         //   cout << SVMLabels[i].GeneName<<" gene at location "<<iGene << endl;
00360         if (iGene != -1) {
00361             //       cout << "creating doc" << endl;
00362             iDoc++;
00363             vec_pDoc.push_back(CreateDoc(PCLSet, iGene, iDoc - 1));
00364             vecClass.push_back(SVMLabels[i].Target);
00365         }
00366     }
00367 
00368     DOC** ppDoc;
00369     ppDoc = new DOC*[vec_pDoc.size()];
00370     copy(vec_pDoc.begin(), vec_pDoc.end(), ppDoc);
00371     vec_pDoc.clear();
00372     PATTERN* pPattern = new PATTERN;
00373     pPattern->doc = ppDoc;
00374 
00375     pPattern->totdoc = iDoc;
00376     //   cout << "number of document=" << pPattern->totdoc << endl;
00377     LABEL* pLabel = new LABEL;
00378     double* aClass;
00379     aClass = new double[vecClass.size()];
00380     copy(vecClass.begin(), vecClass.end(), aClass);
00381     vecClass.clear();
00382     pLabel->Class = aClass;
00383     pLabel->totdoc = iDoc;
00384 
00385     EXAMPLE* aExample;
00386     aExample = new EXAMPLE[1];
00387     //cout<<"aExample @"<<aExample<<endl;
00388     aExample[0].x = *pPattern;
00389     aExample[0].y = *pLabel;
00390     SAMPLE* pSample = new SAMPLE;
00391     pSample->n = 1;
00392     pSample->examples = aExample;
00393     /* cout << "examples @" << pSample->examples << endl;
00394      cout<< "ppDoc="<<ppDoc<<endl;
00395      cout << "docs @" << pSample->examples[0].x.doc << endl;
00396      cout<<"done creating sample"<<endl;
00397      cout<<"sample @ "<<pSample<<endl;*/
00398     return pSample;
00399 }
00400 
00401 SAMPLE* CSVMPERF::CreateSample(Sleipnir::CPCL &PCL, vector<SVMLabel> SVMLabels) {
00402     size_t i, j, iGene, iDoc;
00403     // cerr<<"CREATE pDoc vector"<<endl;
00404     vector<DOC*> vec_pDoc;
00405     //cerr << "RESERVE pDoc"<<endl;
00406     vec_pDoc.reserve(SVMLabels.size());
00407     //cerr<<"CREATE class"<<endl;
00408     vector<double> vecClass;
00409     //cerr<<"RESERVE class"<<endl;
00410     vecClass.reserve(SVMLabels.size());
00411     iDoc = 0;
00412     float numPositives, numNegatives;
00413     numPositives = numNegatives = 0;
00414     for (i = 0; i < SVMLabels.size(); i++) {
00415         //     cout<< "processing gene " << SVMLabels[i].GeneName << endl;
00416         if (!SVMLabels[i].hasIndex) {
00417             SVMLabels[i].SetIndex(PCL.GetGene(SVMLabels[i].GeneName));
00418         }
00419         iGene = SVMLabels[i].index;
00420         //   cout << SVMLabels[i].GeneName<<" gene at location "<<iGene << endl;
00421         if (iGene != -1) {
00422             //       cout << "creating doc" << endl;
00423             iDoc++;
00424             vec_pDoc.push_back(CreateDoc(PCL, iGene, iDoc - 1));
00425             vecClass.push_back(SVMLabels[i].Target);
00426         }
00427     }
00428 
00429     DOC** ppDoc;
00430     //cerr<<"NEW ppDoc"<<endl;
00431     ppDoc = new DOC*[vec_pDoc.size()];
00432     copy(vec_pDoc.begin(), vec_pDoc.end(), ppDoc);
00433     vec_pDoc.clear();
00434     //cerr<<"NEW Pattern"<<endl;
00435     PATTERN* pPattern = new PATTERN;
00436     pPattern->doc = ppDoc;
00437 
00438     pPattern->totdoc = iDoc;
00439     LABEL* pLabel = new LABEL;
00440     double* aClass;
00441     cerr << "NEW Class array" << endl;
00442     aClass = new double[vecClass.size()];
00443     copy(vecClass.begin(), vecClass.end(), aClass);
00444     vecClass.clear();
00445     pLabel->Class = aClass;
00446     pLabel->totdoc = iDoc;
00447 
00448     EXAMPLE* aExample;
00449     //cerr<<"NEW Example"<<endl;
00450     aExample = new EXAMPLE[1];
00451     //cout<<"aExample @"<<aExample<<endl;
00452     aExample[0].x = *pPattern;
00453     aExample[0].y = *pLabel;
00454     //cerr<<"NEW Sample"<<endl;
00455     SAMPLE* pSample = new SAMPLE;
00456     pSample->n = 1;
00457     pSample->examples = aExample;
00458     /* cout << "examples @" << pSample->examples << endl;
00459      cout<< "ppDoc="<<ppDoc<<endl;
00460      cout << "docs @" << pSample->examples[0].x.doc << endl;
00461      cout<<"done creating sample"<<endl;
00462      cout<<"sample @ "<<pSample<<endl;*/
00463     return pSample;
00464     //cerr<<"DONE CreateSample"<<endl;
00465 }
00466 
00467 SAMPLE** CSVMPERF::CreateSampleBootStrap(Sleipnir::CPCL &PCL,
00468         vector<SVMLabel>& SVMLabels, vector<vector<size_t> > vecvecIndex) {
00469 
00470     size_t i, j, iGene, iDoc;
00471     vector<DOC*> vec_pDoc;
00472     vec_pDoc.reserve(SVMLabels.size());
00473     vector<vector<double> > vecvecClass;
00474     vector<double> vecClass;
00475     vecClass.reserve(SVMLabels.size());
00476     iDoc = 0;
00477     float numPositives, numNegatives;
00478     numPositives = numNegatives = 0;
00479     //Creat all the docs once
00480     for (i = 0; i < SVMLabels.size(); i++) {
00481         //the labels will have an index
00482         iGene = SVMLabels[i].index;
00483         if (iGene != -1) {
00484             iDoc++;
00485             vec_pDoc.push_back(CreateDoc(PCL, iGene, iDoc - 1));
00486             vecClass.push_back(SVMLabels[i].Target);
00487 
00488         }
00489     }
00490     size_t numBootStraps = vecvecIndex.size();
00491     SAMPLE** ppSample = new SAMPLE*[numBootStraps];
00492     DOC** ppDoc;
00493     for (i = 0; i < numBootStraps; i++) {
00494         //get a new ppDoc
00495         ppDoc = new DOC*[vecvecIndex[i].size()];
00496         for (j = 0; j < vecvecIndex[i].size(); j++) {
00497             ppDoc[j] = vec_pDoc[vecvecIndex[i][j]]; //assign the pointer
00498         }
00499         //set up the pattern
00500         PATTERN* pPattern = new PATTERN;
00501         pPattern->doc = ppDoc;
00502         pPattern->totdoc = iDoc;
00503 
00504         //set up the labels
00505         LABEL* pLabel = new LABEL;
00506         double* aClass;
00507         aClass = new double[vecvecIndex[i].size()];
00508         for (j = 0; j < vecvecIndex[i].size(); j++) {
00509             aClass[j] = SVMLabels[vecvecIndex[i][j]].Target;
00510         }
00511 
00512         pLabel->Class = aClass;
00513         pLabel->totdoc = iDoc;
00514 
00515         //set up the Example
00516         EXAMPLE* aExample;
00517         aExample = new EXAMPLE[1];
00518         aExample[0].x = *pPattern;
00519         aExample[0].y = *pLabel;
00520 
00521         //set up the Sample
00522         ppSample[i] = new SAMPLE;
00523         ppSample[i]->n = 1;
00524         ppSample[i]->examples = aExample;
00525     }
00526     return ppSample;
00527 }
00528 
00529 SAMPLE * CSVMPERF::CreateSample(Sleipnir::CDat& Dat, vector<SVMLabel> SVMLabels) {
00530     size_t i, j, iGene, iDoc;
00531     vector<DOC*> vec_pDoc;
00532     vector<double> vecClass;
00533     vector<size_t> veciGene;
00534     iDoc = 0;
00535     float numPositives, numNegatives;
00536     numPositives = numNegatives = 0;
00537     for (i = 0; i < SVMLabels.size(); i++) {
00538         //     cout<< "processing gene " << SVMLabels[i].GeneName << endl;
00539         iGene = Dat.GetGene(SVMLabels[i].GeneName);
00540         //   cout << SVMLabels[i].GeneName<<" gene at location "<<iGene << endl;
00541         if (iGene != -1) {
00542             //       cout << "creating doc" << endl;
00543             iDoc++;
00544             vec_pDoc.push_back(CreateDoc(Dat, iGene, iDoc - 1));
00545             vecClass.push_back(SVMLabels[i].Target);
00546         }
00547     }
00548 
00549     DOC** ppDoc;
00550     ppDoc = new DOC*[vec_pDoc.size()];
00551     copy(vec_pDoc.begin(), vec_pDoc.end(), ppDoc);
00552     vec_pDoc.clear();
00553     PATTERN* pPattern = new PATTERN;
00554     pPattern->doc = ppDoc;
00555 
00556     pPattern->totdoc = iDoc;
00557     //   cout << "number of document=" << pPattern->totdoc << endl;
00558     LABEL* pLabel = new LABEL;
00559     double* aClass;
00560     aClass = new double[vecClass.size()];
00561     copy(vecClass.begin(), vecClass.end(), aClass);
00562     vecClass.clear();
00563     pLabel->Class = aClass;
00564     pLabel->totdoc = iDoc;
00565 
00566     EXAMPLE* aExample;
00567     aExample = new EXAMPLE[1];
00568     //cout<<"aExample @"<<aExample<<endl;
00569     aExample[0].x = *pPattern;
00570     aExample[0].y = *pLabel;
00571     SAMPLE* pSample = new SAMPLE;
00572     pSample->n = 1;
00573     pSample->examples = aExample;
00574     /* cout << "examples @" << pSample->examples << endl;
00575      cout<< "ppDoc="<<ppDoc<<endl;
00576      cout << "docs @" << pSample->examples[0].x.doc << endl;
00577      cout<<"done creating sample"<<endl;
00578      cout<<"sample @ "<<pSample<<endl;*/
00579     return pSample;
00580 }
00581 
00582 //Single gene classification
00583 
00584 vector<Result> CSVMPERF::Classify(Sleipnir::CPCL &PCL,
00585         vector<SVMLabel> SVMLabels) {
00586     size_t i, j, iGene, iDoc;
00587     vector<DOC*> vec_pDoc;
00588     vector<double> vecClass;
00589     vector<Result> vecResult;
00590     iDoc = 0;
00591     DOC** ppDoc;
00592     ppDoc = new DOC*[1];
00593     PATTERN pattern;
00594     pattern.doc = ppDoc;
00595     pattern.totdoc = 1;
00596     //cerr << "CLASSIFY classifying " << endl;
00597     LABEL label;
00598     for (i = 0; i < SVMLabels.size(); i++) {
00599         if (!SVMLabels[i].hasIndex) {
00600             SVMLabels[i].SetIndex(PCL.GetGene(SVMLabels[i].GeneName));
00601         }
00602         iGene = SVMLabels[i].index;
00603         //   cout << "CLASS gene=" << iGene << endl;
00604         if (iGene != -1) {
00605             iDoc++;
00606 
00607             //cout << "CLASS iDOC=" << iDoc << endl;
00608             ppDoc[0] = CreateDoc(PCL, iGene, iDoc);
00609             label
00610                     = classify_struct_example(pattern, &structmodel,
00611                             &struct_parm);
00612 
00613             vecClass.push_back(SVMLabels[i].Target);
00614             vecResult.resize(iDoc);
00615             vecResult[iDoc - 1].GeneName = SVMLabels[i].GeneName;
00616             vecResult[iDoc - 1].Target = SVMLabels[i].Target;
00617             vecResult[iDoc - 1].Value = label.Class[0];
00618             //cerr<<"CLASSIFY Called FreeDoc"<<endl;
00619             FreeDoc(ppDoc[0]);
00620             //cerr<<"CLASSIFY End FreeDoc"<<endl;
00621         }
00622     }
00623 
00624     delete[] ppDoc;
00625     return vecResult;
00626 }
00627 
00628 vector<Result> CSVMPERF::Classify(Sleipnir::CPCLSet &PCLSet,
00629         vector<SVMLabel> SVMLabels) {
00630     size_t i, j, iGene, iDoc;
00631     vector<DOC*> vec_pDoc;
00632     vector<double> vecClass;
00633     vector<Result> vecResult;
00634     iDoc = 0;
00635     for (i = 0; i < SVMLabels.size(); i++) {
00636         iGene = PCLSet.GetGene(SVMLabels[i].GeneName);
00637         //   cout << "CLASS gene=" << iGene << endl;
00638         if (iGene != -1) {
00639             iDoc++;
00640             //  cout << "CLASS iDOC=" << iDoc << endl;
00641             vec_pDoc.push_back(CreateDoc(PCLSet, iGene, iDoc));
00642             vecClass.push_back(SVMLabels[i].Target);
00643             vecResult.resize(iDoc);
00644             vecResult[iDoc - 1].GeneName = SVMLabels[i].GeneName;
00645             vecResult[iDoc - 1].Target = SVMLabels[i].Target;
00646         }
00647     }
00648     DOC** ppDoc;
00649     ppDoc = new DOC*[vec_pDoc.size()];
00650     copy(vec_pDoc.begin(), vec_pDoc.end(), ppDoc);
00651     vec_pDoc.clear();
00652     PATTERN pattern;
00653     pattern.doc = ppDoc;
00654     pattern.totdoc = iDoc;
00655 
00656     LABEL label = classify_struct_example(pattern, &structmodel, &struct_parm);
00657     //   cout << "label totdoc=" << label.totdoc << endl;
00658     for (i = 0; i < label.totdoc; i++) {
00659         vecResult[i].Value = label.Class[i];
00660         //     cout << "CLASS: i=" << i << " value=" << vecResult[i].Value << endl;
00661     }
00662     FreePattern(pattern);
00663     return vecResult;
00664 }
00665 
00666 vector<Result> CSVMPERF::Classify(Sleipnir::CDat &Dat,
00667         vector<SVMLabel> SVMLabels) {
00668     size_t i, j, iGene, iDoc;
00669     vector<DOC*> vec_pDoc;
00670     vector<double> vecClass;
00671     vector<Result> vecResult;
00672     iDoc = 0;
00673     for (i = 0; i < SVMLabels.size(); i++) {
00674         iGene = Dat.GetGene(SVMLabels[i].GeneName);
00675         //   cout << "CLASS gene=" << iGene << endl;
00676         if (iGene != -1) {
00677             iDoc++;
00678             //  cout << "CLASS iDOC=" << iDoc << endl;
00679             vec_pDoc.push_back(CreateDoc(Dat, iGene, iDoc));
00680             vecClass.push_back(SVMLabels[i].Target);
00681             vecResult.resize(iDoc);
00682             vecResult[iDoc - 1].GeneName = SVMLabels[i].GeneName;
00683             vecResult[iDoc - 1].Target = SVMLabels[i].Target;
00684         }
00685     }
00686     DOC** ppDoc;
00687     ppDoc = new DOC*[vec_pDoc.size()];
00688     copy(vec_pDoc.begin(), vec_pDoc.end(), ppDoc);
00689     vec_pDoc.clear();
00690     PATTERN pattern;
00691     pattern.doc = ppDoc;
00692     pattern.totdoc = iDoc;
00693 
00694     LABEL label = classify_struct_example(pattern, &structmodel, &struct_parm);
00695     //   cout << "label totdoc=" << label.totdoc << endl;
00696     for (i = 0; i < label.totdoc; i++) {
00697         vecResult[i].Value = label.Class[i];
00698         //     cout << "CLASS: i=" << i << " value=" << vecResult[i].Value << endl;
00699     }
00700     FreePattern(pattern);
00701     return vecResult;
00702 }
00703 
00704 //Create DOC for pais of genes
00705 //For pairs of genes using PCLSet
00706 
00707 DOC* CSVMPERF::CreateDoc(Sleipnir::CPCL &PCL, size_t iGene, size_t jGene,
00708         size_t iDoc) {
00709     WORD* aWords;
00710     size_t i, j, iWord, iWords, iPCL, iExp;
00711     float d, e;
00712     DOC* pRet;
00713     pRet->fvec->words[0].weight;
00714     //get number of features
00715 
00716     iWords = PCL.GetExperiments();
00717 
00718     //      cout << "CD:iwords=" << iWords << endl;
00719     aWords = new WORD[iWords + 1];
00720     //number the words
00721     for (i = 0; i < iWords; ++i) {
00722         //   cout<<i<<endl;
00723         aWords[i].wnum = i + 1;
00724         // asWords[ i ].wnum = 0;
00725     }
00726     aWords[i].wnum = 0;
00727     //get the values;
00728     iWord = 0;
00729 
00730     for (j = 0; j < PCL.GetExperiments(); j++) {
00731         //     cout<<"CD:iWord="<<iWord<<endl;
00732         if (!Sleipnir::CMeta::IsNaN(d = PCL.Get(iGene, j))
00733                 && !Sleipnir::CMeta::IsNaN(e = PCL.Get(jGene, j))) {
00734             //   if (i==0 && j==0)
00735             //       cout<<"First value is "<<d<<endl;
00736             aWords[iWord].weight = d * e;
00737         } else
00738             aWords[iWord].weight = 0;
00739         iWord++;
00740     }
00741 
00742     pRet = create_example(iDoc, 0, 0, 1, create_svector(aWords, "", 1));
00743     delete[] aWords;
00744     // cout<<"done creating DOC"<<endl;
00745     return pRet;
00746 }
00747 
00748 //Create Sample for pairs of genes, wraps the above 2 functions
00749 
00750 SAMPLE* CSVMPERF::CreateSample(Sleipnir::CPCL &PCL, Sleipnir::CDat& Answers,
00751         const vector<string>& CVGenes) {
00752     size_t i, j, iGene, jGene, iDoc;
00753     vector<DOC*> vec_pDoc;
00754     vector<double> vecClass;
00755     vector<size_t> veciGene;
00756     Sleipnir::CPCL* pP = &PCL;
00757     iDoc = 0;
00758     float numPositives, numNegatives;
00759     numPositives = numNegatives = 0;
00760     set<string> setGenes;
00761     set<string>::iterator iterSet;
00762     float w;
00763     for (i = 0; i < CVGenes.size(); i++) {
00764         setGenes.insert(CVGenes[i]);
00765     }
00766     for (i = 0; i < Answers.GetGenes() - 1; i++) {
00767         if ((setGenes.find(Answers.GetGene(i)) != setGenes.end()) && ((iGene
00768                 = PCL.GetGene(Answers.GetGene(i))) != -1)) {
00769             for (j = i + 1; j < Answers.GetGenes(); j++) {
00770                 if ((setGenes.find(Answers.GetGene(j)) != setGenes.end())
00771                         && ((jGene = PCL.GetGene(Answers.GetGene(j))) != -1)) {
00772                     if (!Sleipnir::CMeta::IsNaN(w = Answers.Get(i, j))) {
00773                         iDoc++;
00774                         vec_pDoc.push_back(CreateDoc(PCL, iGene, jGene, iDoc
00775                                 - 1));
00776 
00777                     }
00778                 }
00779             }
00780         }
00781     }
00782 
00783     DOC** ppDoc;
00784     ppDoc = new DOC*[vec_pDoc.size()];
00785     copy(vec_pDoc.begin(), vec_pDoc.end(), ppDoc);
00786     vec_pDoc.clear();
00787     PATTERN* pPattern = new PATTERN;
00788     pPattern->doc = ppDoc;
00789 
00790     pPattern->totdoc = iDoc;
00791     LABEL* pLabel = new LABEL;
00792     double* aClass;
00793     aClass = new double[vecClass.size()];
00794     copy(vecClass.begin(), vecClass.end(), aClass);
00795     vecClass.clear();
00796     pLabel->Class = aClass;
00797     pLabel->totdoc = iDoc;
00798 
00799     EXAMPLE* aExample;
00800     aExample = new EXAMPLE[1];
00801     aExample[0].x = *pPattern;
00802     aExample[0].y = *pLabel;
00803     SAMPLE* pSample = new SAMPLE;
00804     pSample->n = 1;
00805     pSample->examples = aExample;
00806     return pSample;
00807 }
00808 
00809 void CSVMPERF::Classify(Sleipnir::CPCL& PCL, Sleipnir::CDat& Answers,
00810         Sleipnir::CDat& Values, Sleipnir::CDat& Counts,
00811         const vector<string>& CVGenes) {
00812     size_t i, j, iGene, jGene, iDoc;
00813     set<string> setGenes;
00814     set<string>::iterator iterSet;
00815     vector<DOC*> vec_pDoc;
00816     vector<pair<size_t, size_t> > vecPairIndex;
00817     iDoc = 0;
00818     for (i = 0; i < CVGenes.size(); i++) {
00819         setGenes.insert(CVGenes[i]);
00820     }
00821 
00822     cout << "the number of genes  to be classified is " << setGenes.size()
00823             << endl;
00824     for (i = 0; i < Answers.GetGenes() - 1; i++) {
00825         if ((setGenes.find(Answers.GetGene(i)) != setGenes.end()) && ((iGene
00826                 = PCL.GetGene(Answers.GetGene(i))) != -1)) {
00827             for (j = i + 1; j < Answers.GetGenes(); j++) {
00828                 if ((setGenes.find(Answers.GetGene(j)) != setGenes.end())
00829                         && ((jGene = PCL.GetGene(Answers.GetGene(j))) != -1)) {
00830                     if (!Sleipnir::CMeta::IsNaN(Answers.Get(i, j))) {
00831                         iDoc++;
00832                         vec_pDoc.push_back(CreateDoc(PCL, iGene, jGene, iDoc
00833                                 - 1));
00834                         vecPairIndex.push_back(pair<size_t, size_t> (i, j));
00835                     }
00836                 }
00837             }
00838         }
00839     }
00840     DOC** ppDoc;
00841     ppDoc = new DOC*[vec_pDoc.size()];
00842     copy(vec_pDoc.begin(), vec_pDoc.end(), ppDoc);
00843     vec_pDoc.clear();
00844     PATTERN pattern;
00845     pattern.doc = ppDoc;
00846     pattern.totdoc = iDoc;
00847 
00848     LABEL label = classify_struct_example(pattern, &structmodel, &struct_parm);
00849     pair<size_t, size_t> tmpPair;
00850     float d;
00851     for (i = 0; i < vecPairIndex.size(); i++) {
00852         tmpPair = vecPairIndex[i];
00853         if (!Sleipnir::CMeta::IsNaN(d = Values.Get(tmpPair.first,
00854                 tmpPair.second)))
00855             Values.Set(tmpPair.first, tmpPair.second, d + label.Class[i]);
00856         else
00857             Values.Set(tmpPair.first, tmpPair.second, label.Class[i]);
00858         if (!Sleipnir::CMeta::IsNaN(d = Counts.Get(tmpPair.first,
00859                 tmpPair.second)))
00860             Counts.Set(tmpPair.first, tmpPair.second, d + 1);
00861         else
00862             Counts.Set(tmpPair.first, tmpPair.second, 1);
00863     }
00864     FreePattern(pattern);
00865 
00866 }
00867 
00868 void CSVMPERF::ClassifyAll(Sleipnir::CPCL& PCL, Sleipnir::CDat& Values,
00869         Sleipnir::CDat& Counts, const vector<string>& CVGenes) {
00870     size_t iGene, jGene, i, j, k;
00871     string strGeneOne, strGeneTwo;
00872     set<string> setGenes;
00873     set<string>::iterator iterSet;
00874     vector<DOC*> vec_pDoc;
00875     vector<pair<size_t, size_t> > vecPairIndex;
00876     size_t iDoc;
00877     for (i = 0; i < CVGenes.size(); i++) {
00878         setGenes.insert(CVGenes[i]);
00879     }
00880 
00881     for (i = 0; i < PCL.GetGenes() - 1; i++) {
00882         if (setGenes.find(PCL.GetGene(i)) == setGenes.end()) {
00883             iDoc = 0;
00884             for (j = i + 1; j < PCL.GetGenes() - 1; j++) {
00885                 if (setGenes.find(PCL.GetGene(j)) == setGenes.end()) {
00886                     iDoc++;
00887                     vec_pDoc.push_back(CreateDoc(PCL, i, j, iDoc - 1));
00888                     vecPairIndex.push_back(pair<size_t, size_t> (i, j));
00889                 }
00890             }
00891             DOC** ppDoc;
00892             ppDoc = new DOC*[vec_pDoc.size()];
00893             copy(vec_pDoc.begin(), vec_pDoc.end(), ppDoc);
00894             vec_pDoc.clear();
00895             PATTERN pattern;
00896             pattern.doc = ppDoc;
00897             pattern.totdoc = iDoc;
00898 
00899             LABEL label = classify_struct_example(pattern, &structmodel,
00900                     &struct_parm);
00901             pair<size_t, size_t> tmpPair;
00902             float d;
00903             for (k = 0; k < vecPairIndex.size(); k++) {
00904                 tmpPair = vecPairIndex[k];
00905                 if (!Sleipnir::CMeta::IsNaN(d = Values.Get(tmpPair.first,
00906                         tmpPair.second)))
00907                     Values.Set(tmpPair.first, tmpPair.second, d
00908                             + label.Class[k]);
00909                 else
00910                     Values.Set(tmpPair.first, tmpPair.second, label.Class[k]);
00911                 if (!Sleipnir::CMeta::IsNaN(d = Counts.Get(tmpPair.first,
00912                         tmpPair.second)))
00913                     Counts.Set(tmpPair.first, tmpPair.second, d + 1);
00914                 else
00915                     Counts.Set(tmpPair.first, tmpPair.second, 1);
00916             }
00917             vecPairIndex.resize(0);
00918             FreePattern(pattern);
00919         }
00920     }
00921 }
00922 
00923 void CSVMPERF::ClassifyAll(Sleipnir::CPCL& PCL, Sleipnir::CDat& Values,
00924         const vector<string>& CVGenes) {
00925     size_t iGene, jGene, i, j, k;
00926     string strGeneOne, strGeneTwo;
00927     set<string> setGenes;
00928     set<string>::iterator iterSet;
00929     vector<DOC*> vec_pDoc;
00930     vector<pair<size_t, size_t> > vecPairIndex;
00931     size_t iDoc;
00932     for (i = 0; i < CVGenes.size(); i++) {
00933         setGenes.insert(CVGenes[i]);
00934     }
00935 
00936     for (i = 0; i < PCL.GetGenes() - 1; i++) {
00937         if (setGenes.find(PCL.GetGene(i)) == setGenes.end()) {
00938             iDoc = 0;
00939             for (j = i + 1; j < PCL.GetGenes() - 1; j++) {
00940                 if (setGenes.find(PCL.GetGene(j)) == setGenes.end()) {
00941                     iDoc++;
00942                     vec_pDoc.push_back(CreateDoc(PCL, i, j, iDoc - 1));
00943                     vecPairIndex.push_back(pair<size_t, size_t> (i, j));
00944                 }
00945             }
00946             DOC** ppDoc;
00947             ppDoc = new DOC*[vec_pDoc.size()];
00948             copy(vec_pDoc.begin(), vec_pDoc.end(), ppDoc);
00949             vec_pDoc.clear();
00950             PATTERN pattern;
00951             pattern.doc = ppDoc;
00952             pattern.totdoc = iDoc;
00953 
00954             LABEL label = classify_struct_example(pattern, &structmodel,
00955                     &struct_parm);
00956             pair<size_t, size_t> tmpPair;
00957             float d;
00958             for (k = 0; k < vecPairIndex.size(); k++) {
00959                 tmpPair = vecPairIndex[k];
00960                 Values.Set(tmpPair.first, tmpPair.second, d + label.Class[k]);
00961             }
00962             vecPairIndex.resize(0);
00963             FreePattern(pattern);
00964         }
00965     }
00966 }
00967 
00968 
00969 // populate docs for each label gene pair from a vector of dat file names
00970 bool CSVMPERF::CreateDoc(vector<string>& vecstrDatasets,
00971              vector<SVMLabelPair*>& vecLabels,
00972              const vector<string>& LabelsGene,
00973              Sleipnir::CDat::ENormalize eNormalize){
00974   
00975   size_t i, j, k, iGene, jGene, iDoc, iWord, iWords;
00976   float d;
00977   vector<size_t> veciGene;  
00978   vector<WORD*> vec_pWord;
00979   vector<size_t> labelg2dat;  
00980   
00981   WORD* aWords;
00982   DOC* pRet;    
00983   Sleipnir::CDat Dat;
00984   
00985   iWords = vecstrDatasets.size();
00986   
00987   vec_pWord.reserve(vecLabels.size());
00988   
00989   // initialize all the WORDs
00990   for(i=0; i < vecLabels.size(); i++){
00991     aWords = new WORD[iWords + 1];
00992     
00993     // set wnum values
00994     for (k = 0; k < iWords; ++k) {
00995       //   cout<<i<<endl;
00996       aWords[k].wnum = k + 1;
00997       // asWords[ i ].wnum = 0;
00998     }
00999     aWords[k].wnum = 0;    
01000     vec_pWord[i] = aWords;
01001   }
01002   
01003   // initialize the gene mappings 
01004   labelg2dat.resize(LabelsGene.size());
01005   
01006   // now open up all datasets
01007   for(i=0; i < vecstrDatasets.size(); i++){
01008     if(!Dat.Open(vecstrDatasets[i].c_str())) {
01009       cerr << vecstrDatasets[i].c_str() << endl;
01010       cerr << "Could not open: " << vecstrDatasets[i] << endl;
01011       return false;
01012     }
01013     
01014     // normalize dat file
01015     if( eNormalize != Sleipnir::CDat::ENormalizeNone ){
01016       cerr << "Normalize input data" << endl;      
01017       Dat.Normalize( eNormalize );
01018     }
01019     
01020     cerr << "Open: " << vecstrDatasets[i] << endl;
01021     
01022     // construct gene name mapping
01023     for(k=0; k < LabelsGene.size(); k++){
01024       labelg2dat[k] = Dat.GetGene(LabelsGene[k]);
01025     }    
01027     
01028     for (j = 0; j < vecLabels.size(); j++) {
01029       aWords = vec_pWord[j];
01030             
01031       if( ((iGene = labelg2dat[vecLabels[j]->iidx]) == -1 ) || 
01032       ((jGene = labelg2dat[vecLabels[j]->jidx]) == -1 )){
01033     aWords[i].weight = 0;
01034     continue;
01035       }      
01036       
01037       if (!Sleipnir::CMeta::IsNaN(d = Dat.Get(iGene, jGene))) {
01038     aWords[i].weight = d;
01039       } else
01040     aWords[i].weight = 0;            
01041     }
01042   }
01043   
01044   // now create a Doc per label
01045   for (j = 0; j < vecLabels.size(); j++) {
01046     //pRet->fvec->words[0].weight;
01047     aWords = vec_pWord[j];
01048     pRet = create_example(j, 0, 0, 1, create_svector(aWords, "", 1));
01049     vecLabels[j]->pDoc = pRet;
01050     delete[] aWords;
01051   }
01052   
01053   return true;
01054 }
01055 
01056 SAMPLE* CSVMPERF::CreateSample(vector<SVMLabelPair*>& SVMLabels) {
01057     size_t i, j, iGene, iDoc;
01058     vector<DOC*> vec_pDoc;
01059     vector<double> vecClass;
01060     iDoc = 0;
01061     for (i = 0; i < SVMLabels.size(); i++) {
01062       iDoc++;
01063       vec_pDoc.push_back(SVMLabels[i]->pDoc);
01064       vecClass.push_back(SVMLabels[i]->Target);
01065     }
01066     
01067     DOC** ppDoc;
01068     ppDoc = new DOC*[vec_pDoc.size()];
01069     copy(vec_pDoc.begin(), vec_pDoc.end(), ppDoc);
01070     vec_pDoc.clear();
01071     PATTERN* pPattern = new PATTERN;
01072     pPattern->doc = ppDoc;
01073 
01074     pPattern->totdoc = iDoc;
01075     //   cout << "number of document=" << pPattern->totdoc << endl;
01076     LABEL* pLabel = new LABEL;
01077     double* aClass;
01078     aClass = new double[vecClass.size()];
01079     copy(vecClass.begin(), vecClass.end(), aClass);
01080     vecClass.clear();
01081     pLabel->Class = aClass;
01082     pLabel->totdoc = iDoc;
01083     
01084     EXAMPLE* aExample;
01085     aExample = new EXAMPLE[1];
01086     //cout<<"aExample @"<<aExample<<endl;
01087     aExample[0].x = *pPattern;
01088     aExample[0].y = *pLabel;
01089     SAMPLE* pSample = new SAMPLE;
01090     pSample->n = 1;
01091     pSample->examples = aExample;
01092     /* cout << "examples @" << pSample->examples << endl;
01093      cout<< "ppDoc="<<ppDoc<<endl;
01094      cout << "docs @" << pSample->examples[0].x.doc << endl;
01095      cout<<"done creating sample"<<endl;
01096      cout<<"sample @ "<<pSample<<endl;*/
01097     return pSample;
01098 }
01099 
01100 void CSVMPERF::Classify(Sleipnir::CDat &Results,
01101                   vector<SVMLabelPair*>& SVMLabels) {
01102     size_t i, iGene, iDoc;
01103     iDoc = 0;
01104     DOC** ppDoc;
01105     ppDoc = new DOC*[1];
01106     PATTERN pattern;
01107     pattern.doc = ppDoc;
01108     pattern.totdoc = 1;
01109     //cerr << "CLASSIFY classifying " << endl;
01110     LABEL label;
01111     for (i = 0; i < SVMLabels.size(); i++) {
01112       ppDoc[0] = SVMLabels[i]->pDoc;
01113       label
01114         = classify_struct_example(pattern, &structmodel,
01115                       &struct_parm);
01116       
01117       Results.Set(SVMLabels[i]->iidx, SVMLabels[i]->jidx, label.Class[0]);
01118     }
01119     
01120     delete ppDoc;
01121 }
01122 
01123 void CSVMPERF::FreeSample_leave_Doc(SAMPLE s){
01124   /* Frees the memory of sample s. */
01125   int i;
01126   for(i=0;i<s.n;i++) {
01127     free(s.examples[i].x.doc);
01128     free_label(s.examples[i].y);
01129   }
01130   free(s.examples);
01131 }
01132 
// Platt's binary SVM Probablistic Output
// Assume dec_values and labels have same dimensions and genes
//
// Fits the two-parameter sigmoid p(y=1|d) = 1/(1+exp(A*d+B)) to the decision
// values stored in Results, by Newton's method with a backtracking line
// search (this follows LIBSVM's sigmoid_train implementation).
// @param Results    CDat of raw SVM decision values, indexed by (iidx, jidx).
// @param SVMLabels  labeled pairs; Target > 0 positive, Target < 0 negative,
//                   Target == 0 ignored.
// @param A, B       output sigmoid parameters (passed by reference).
void CSVMPERF::sigmoid_train(Sleipnir::CDat& Results,
                 vector<SVMLabelPair*>& SVMLabels,
                 float& A, float& B){
    double prior1=0, prior0 = 0;
    size_t i, j, idx;
    float d, lab;

    int max_iter=100;   // Maximal number of iterations
    double min_step=1e-10;  // Minimal step taken in line search
    double sigma=1e-12; // For numerically strict PD of Hessian
    double eps=1e-5;
    vector<double> t;   // per-example regularized targets (hiTarget/loTarget)
    double fApB,p,q,h11,h22,h21,g1,g2,det,dA,dB,gd,stepsize;
    double newA,newB,newf,d1,d2;
    int iter;
    vector<float> dec_values;

    // Count positives/negatives to set the label priors.
    // Negatives are values less than 0
    for(i = 0; i < SVMLabels.size(); i++)
      if( SVMLabels[i]->Target > 0 )
        prior1 += 1;
      else if(SVMLabels[i]->Target < 0 )
        prior0 += 1;

    // initialize size
    t.resize(prior0+prior1);


    // classify training examples
    // NOTE(review): ppDoc/pattern/label are set up here but never used or
    // freed in this function -- ppDoc leaks.  The decision values are read
    // from Results below rather than recomputed.
    LABEL label;
    size_t iGene, iDoc;
    iDoc = 0;
    DOC** ppDoc;
    ppDoc = new DOC*[1];
    PATTERN pattern;
    pattern.doc = ppDoc;
    pattern.totdoc = 1;

    dec_values.resize(t.size());

    // Collect the precomputed decision value of each labeled pair.
    idx = 0;
    for (i = 0; i < SVMLabels.size(); i++) {
      if( SVMLabels[i]->Target == 0 )
        continue;
      // NOTE(review): pairs whose Results entry is NaN are skipped here but
      // NOT in the t[]/fval loop below, so t[] and dec_values[] can fall
      // out of alignment if any labeled pair lacks a decision value --
      // confirm that callers guarantee Results is fully populated.
      if( Sleipnir::CMeta::IsNaN(d = Results.Get(SVMLabels[i]->iidx, SVMLabels[i]->jidx)))
        continue;

      dec_values[idx] = d;
      idx++;
    }

    // Initial Point and Initial Fun Value
    A=0.0; B=log((prior0+1.0)/(prior1+1.0));
    double hiTarget=(prior1+1.0)/(prior1+2.0);
    double loTarget=1/(prior0+2.0);
    double fval = 0.0;

    // Assign each labeled pair its regularized target and accumulate the
    // initial negative log-likelihood in a numerically stable form.
    idx = 0;
    for(i = 0; i < SVMLabels.size(); i++){
      if ( SVMLabels[i]->Target > 0) t[idx]=hiTarget;
      else if( SVMLabels[i]->Target < 0 ) t[idx]=loTarget;
      else
        continue;

      fApB = dec_values[idx]*A+B;
      if (fApB>=0)
        fval += t[idx]*fApB + log(1+exp(-fApB));
      else
        fval += (t[idx] - 1)*fApB +log(1+exp(fApB));

      idx++;
    }

    // Newton iterations on (A, B) with a damped Hessian (H + sigma*I).
    for (iter=0;iter<max_iter;iter++)
    {
        // Update Gradient and Hessian (use H' = H + sigma I)
        h11=sigma; // numerically ensures strict PD
        h22=sigma;
        h21=0.0;g1=0.0;g2=0.0;
        for (i=0; i < dec_values.size(); i++)
        {
            fApB = dec_values[i]*A+B;
            // p = sigmoid probability, q = 1-p, computed stably for both
            // signs of fApB.
            if (fApB >= 0)
            {
                p=exp(-fApB)/(1.0+exp(-fApB));
                q=1.0/(1.0+exp(-fApB));
            }
            else
            {
                p=1.0/(1.0+exp(fApB));
                q=exp(fApB)/(1.0+exp(fApB));
            }
            d2=p*q;
            h11+=dec_values[i]*dec_values[i]*d2;
            h22+=d2;
            h21+=dec_values[i]*d2;
            d1=t[i]-p;
            g1+=dec_values[i]*d1;
            g2+=d1;
        }

        // Stopping Criteria
        if (fabs(g1)<eps && fabs(g2)<eps)
            break;

        // Finding Newton direction: -inv(H') * g
        det=h11*h22-h21*h21;
        dA=-(h22*g1 - h21 * g2) / det;
        dB=-(-h21*g1+ h11 * g2) / det;
        gd=g1*dA+g2*dB;


        stepsize = 1;       // Line Search
        while (stepsize >= min_step)
        {
            newA = A + stepsize * dA;
            newB = B + stepsize * dB;

            // New function value
            newf = 0.0;
            for (i=0; i < dec_values.size(); i++)
            {
                fApB = dec_values[i]*newA+newB;
                if (fApB >= 0)
                    newf += t[i]*fApB + log(1+exp(-fApB));
                else
                    newf += (t[i] - 1)*fApB +log(1+exp(fApB));
            }
            // Check sufficient decrease (Armijo-style condition)
            if (newf<fval+0.0001*stepsize*gd)
            {
                A=newA;B=newB;fval=newf;
                break;
            }
            else
                stepsize = stepsize / 2.0;
        }

        if (stepsize < min_step)
        {
          cerr << "Line search fails in two-class probability estimates\n";
          break;
        }
    }

    if (iter>=max_iter)
      cerr << "Reaching maximal iterations in two-class probability estimates\n";
}
01283 
01284 void CSVMPERF::sigmoid_predict(Sleipnir::CDat& Results, vector<SVMLabelPair*>& SVMLabels, float A, float B){
01285   size_t i;
01286   float d, fApB;
01287   
01288   for(i = 0; i < SVMLabels.size(); i++){
01289     if (!Sleipnir::CMeta::IsNaN(d = Results.Get(SVMLabels[i]->iidx, SVMLabels[i]->jidx))){
01290     fApB = d*A+B;
01291     // 1-p used later; avoid catastrophic cancellation
01292     if (fApB >= 0)
01293       Results.Set(SVMLabels[i]->iidx, SVMLabels[i]->jidx, exp(-fApB)/(1.0+exp(-fApB)));
01294     else
01295       Results.Set(SVMLabels[i]->iidx, SVMLabels[i]->jidx, 1.0/(1+exp(fApB)));      
01296     }    
01297   }
01298 }
01299 
// Load a linear model from a plain-text file containing one weight per line
// ('#'-prefixed lines are skipped) and rebuild the MODEL/STRUCTMODEL
// structures SVM-light needs for classification.
// @param file   path to the text file of linear weights.
// @param sparm  structural learning parameters populated from the model.
// @return the reconstructed STRUCTMODEL; sm.w aliases model->lin_weights.
STRUCTMODEL CSVMPERF::read_struct_model_w_linear(char *file, STRUCT_LEARN_PARM *sparm){
  STRUCTMODEL sm;
  MODEL *model;

  // Build a minimal two-support-vector MODEL skeleton; supvec[1] is filled
  // in below with the dense weight vector (alpha[1] = 1 makes it the sole
  // contributor).
  model = (MODEL *)my_malloc(sizeof(MODEL));
  model->supvec = (DOC **)my_malloc(sizeof(DOC *)*2);
  model->alpha = (double *)my_malloc(sizeof(double)*2);
  model->index = NULL; /* index is not copied */
  model->supvec[0] = NULL;
  model->alpha[0] = 0.0;
  model->alpha[1] = 1.0;
  model->sv_num=2;
  model->b = 0;
  model->totwords = 0;

  // read W model
  static const size_t c_iBuffer = 1024;
  char acBuffer[c_iBuffer];
  char* nameBuffer;  // unused
  vector<string> vecstrTokens;
  size_t i, extPlace;
  string Ext, FileName;
  size_t index = 0;
  ifstream ifsm;
  vector<float> SModel;  // weights in file order, one per feature

  ifsm.open(file);
  // NOTE(review): the while(!eof) pattern can process the final buffer
  // twice or once-empty depending on trailing newline -- blank lines are
  // skipped below, so this only matters if the last line repeats; confirm
  // input format.
  while (!ifsm.eof()) {
    ifsm.getline(acBuffer, c_iBuffer - 1);
    acBuffer[c_iBuffer - 1] = 0;
    vecstrTokens.clear();
    Sleipnir::CMeta::Tokenize(acBuffer, vecstrTokens);
    if (vecstrTokens.empty())
      continue;
    // Each line must hold exactly one token (the weight).
    if (vecstrTokens.size() > 1) {
      cerr << "Illegal model line (" << vecstrTokens.size() << "): "
       << acBuffer << endl;
      continue;
    }
    if (acBuffer[0] == '#') {
      cerr << "skipping " << acBuffer << endl;
    } else {
      SModel.push_back(atof(vecstrTokens[0].c_str()));
    }
  }

  // lin_weights is 1-based like SVM-light feature numbering; slot 0 unused.
  model->totwords = SModel.size();
  model->lin_weights=(double *)my_malloc(sizeof(double)*(model->totwords+1));
  model->kernel_parm.kernel_type = LINEAR;

  for(i = 0; i < (model->totwords+1); i++){
    if(i == 0)
      model->lin_weights[i] = 0;
    else
      model->lin_weights[i] = SModel[i-1];
  }

  // Wrap the dense weight vector as the model's single support vector.
  model->supvec[1] = create_example(-1,0,0,0,
                    create_svector_n(model->lin_weights,
                             model->totwords,
                             NULL,1.0));

  sm.svm_model=model;

  // Populate the structural learning parameters to match the loaded model.
  sparm->loss_function=ERRORRATE;
  sparm->bias=0;
  sparm->bias_featurenum=0;
  sparm->num_features=sm.svm_model->totwords;
  sparm->truncate_fvec=(sm.svm_model->kernel_parm.kernel_type==LINEAR);

  if(sm.svm_model->kernel_parm.kernel_type == CUSTOM) /* double kernel */
    sparm->preimage_method=9;

  sm.invL=NULL;
  sm.expansion=NULL;
  sm.expansion_size=0;
  sm.sparse_kernel_type=0;
  sm.w=sm.svm_model->lin_weights;
  sm.sizePsi=sm.svm_model->totwords;

  if((sm.svm_model->kernel_parm.kernel_type!=LINEAR) && sparm->classify_dense)
    add_dense_vectors_to_model(sm.svm_model);
  return(sm);
}
01384   
01385 }
01386