Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "svmperf.h" 00024 #include "pclset.h" 00025 #include "dataset.h" 00026 #include "meta.h" 00027 #include "genome.h" 00028 #include "compactmatrix.h" 00029 #include <vector> 00030 #include <set> 00031 00032 #define SLACK_RESCALING 1 00033 #define MARGIN_RESCALING 2 00034 00035 namespace SVMLight { 00036 #include "svmperf.h" 00037 extern "C" { 00038 // void free_struct_model(STRUCTMODEL sm); 00039 void free_struct_sample(SAMPLE s); 00040 // void svm_learn_struct_joint_custom(SAMPLE sample, 00041 // STRUCT_LEARN_PARM *sparm, 00042 // LEARN_PARM *lparm, KERNEL_PARM *kparm, 00043 // STRUCTMODEL *sm); 00044 // SAMPLE read_struct_examples_sleipnir(DOC **all_docs, double*all_labels, int example_size, int total_features, STRUCT_LEARN_PARM *sparm); 00045 // void free_struct_model(STRUCTMODEL sm); 00046 // void free_struct_sample(SAMPLE s); 00047 // void set_struct_verbosity(long verb); 00048 // double estimate_r_delta_average(DOC **, long, KERNEL_PARM *); 00049 // MODEL *read_model(char *); 00050 LABEL classify_struct_example(PATTERN x, STRUCTMODEL *sm, 00051 STRUCT_LEARN_PARM *sparm); 00052 DOC* create_example(long, long, long, double, SVECTOR *); 00053 SVECTOR * create_svector(WORD *, char *, double); 00054 void set_struct_verbosity(long verb); 00055 00056 } 00057 00058 void CSVMPERF::SetVerbosity(size_t V) { 00059 struct_verbosity = (long) V; 00060 } 00061 00062 bool CSVMPERF::initialize() { 00063 00064 //set directionality 00065 00066 00067 /* set default */ 00068 00069 //Learn_parms 00070 struct_parm.C = 0.01; 00071 struct_parm.slack_norm = 1; 00072 struct_parm.epsilon = DEFAULT_EPS; 00073 struct_parm.custom_argc = 0; 00074 struct_parm.loss_function = 4; 00075 struct_parm.loss_type = DEFAULT_RESCALING; 00076 struct_parm.newconstretrain = 100; 00077 struct_parm.ccache_size = 5; 00078 struct_parm.batch_size = 100; 00079 struct_parm.bias_featurenum = 0; 00080 00081 //Learn_parms 00082 //strcpy (learn_parm.predfile, "trans_predictions"); 00083 strcpy(learn_parm.alphafile, ""); 00084 //verbosity=0;/*verbosity for svm_light*/ 00085 //struct_verbosity = 1; /*verbosity for struct learning portion*/ 00086 learn_parm.biased_hyperplane = 1; 00087 learn_parm.remove_inconsistent = 0; 00088 learn_parm.skip_final_opt_check = 0; 00089 learn_parm.svm_maxqpsize = 10; 00090 learn_parm.svm_newvarsinqp = 0; 00091 learn_parm.svm_iter_to_shrink = -9999; 00092 learn_parm.maxiter = 1000; 00093 learn_parm.kernel_cache_size = 40; 00094 learn_parm.svm_c = 99999999; /* overridden by struct_parm->C */ 00095 learn_parm.eps = 0.001; /* overridden by struct_parm->epsilon */ 00096 learn_parm.transduction_posratio = -1.0; 00097 learn_parm.svm_costratio = 1.0; 00098 learn_parm.svm_costratio_unlab = 1.0; 00099 learn_parm.svm_unlabbound = 1E-5; 00100 learn_parm.epsilon_crit = 0.001; 00101 learn_parm.epsilon_a = 1E-15; /* changed from 1e-15 */ 00102 learn_parm.compute_loo = 0; 00103 learn_parm.rho = 1.0; 00104 learn_parm.xa_depth = 0; 00105 00106 //kernel parms 00107 kernel_parm.kernel_type = 0; 00108 kernel_parm.poly_degree = 3; 00109 kernel_parm.rbf_gamma = 1.0; 00110 kernel_parm.coef_lin = 1; 00111 kernel_parm.coef_const = 1; 00112 strcpy(kernel_parm.custom, "empty"); 00113 00114 if (learn_parm.svm_iter_to_shrink == -9999) { 00115 learn_parm.svm_iter_to_shrink = 100; 00116 } 00117 00118 if ((learn_parm.skip_final_opt_check) 00119 && (kernel_parm.kernel_type == LINEAR)) { 00120 printf( 00121 "\nIt does not make sense to skip the final optimality check for linear kernels.\n\n"); 00122 learn_parm.skip_final_opt_check = 0; 00123 } 00124 00125 //struct parms 00126 struct_parm.bias = 0; 00127 struct_parm.prec_rec_k_frac = 0.5; 00128 struct_parm.sparse_kernel_type = LINEAR; 00129 struct_parm.sparse_kernel_size = 500; 00130 struct_parm.shrinking = 1; 00131 strcpy(struct_parm.sparse_kernel_file, ""); 00132 /* set number of features to -1, indicating that it will be computed 00133 in init_struct_model() */ 00134 struct_parm.num_features = -1; 00135 00136 return true; 00137 } 00138 00139 bool CSVMPERF::parms_check() { 00140 if ((learn_parm.skip_final_opt_check) && (learn_parm.remove_inconsistent)) { 00141 fprintf( 00142 stderr, 00143 "\nIt is necessary to do the final optimality check when removing inconsistent \nexamples.\n"); 00144 return false; 00145 } 00146 if ((learn_parm.svm_maxqpsize < 2)) { 00147 fprintf( 00148 stderr, 00149 "\nMaximum size of QP-subproblems not in valid range: %ld [2..]\n", 00150 learn_parm.svm_maxqpsize); 00151 return false; 00152 } 00153 if ((learn_parm.svm_maxqpsize < learn_parm.svm_newvarsinqp)) { 00154 fprintf( 00155 stderr, 00156 "\nMaximum size of QP-subproblems [%ld] must be larger than the number of\n", 00157 learn_parm.svm_maxqpsize); 00158 fprintf( 00159 stderr, 00160 "new variables [%ld] entering the working set in each iteration.\n", 00161 learn_parm.svm_newvarsinqp); 00162 return false; 00163 } 00164 if (learn_parm.svm_iter_to_shrink < 1) { 00165 fprintf( 00166 stderr, 00167 "\nMaximum number of iterations for shrinking not in valid range: %ld [1,..]\n", 00168 learn_parm.svm_iter_to_shrink); 00169 return false; 00170 } 00171 if (struct_parm.C < 0) { 00172 fprintf( 00173 stderr, 00174 "\nTrade-off between training error and margin is not set (C<0)!\nC value will be set to default value. Clight = Cpef * 100 / n \n"); 00175 } 00176 if (learn_parm.transduction_posratio > 1) { 00177 fprintf(stderr, 00178 "\nThe fraction of unlabeled examples to classify as positives must\n"); 00179 fprintf(stderr, "be less than 1.0 !!!\n\n"); 00180 return false; 00181 } 00182 if (learn_parm.svm_costratio <= 0) { 00183 fprintf(stderr, 00184 "\nThe COSTRATIO parameter must be greater than zero!\n\n"); 00185 return false; 00186 } 00187 if (struct_parm.epsilon <= 0) { 00188 fprintf(stderr, 00189 "\nThe epsilon parameter must be greater than zero!\n\n"); 00190 return false; 00191 } 00192 if ((struct_parm.slack_norm < 1) || (struct_parm.slack_norm > 2)) { 00193 fprintf(stderr, 00194 "\nThe norm of the slacks must be either 1 (L1-norm) or 2 (L2-norm)!\n\n"); 00195 return false; 00196 } 00197 00198 if ((struct_parm.loss_type != SLACK_RESCALING) && (struct_parm.loss_type 00199 != MARGIN_RESCALING)) { 00200 fprintf( 00201 stderr, 00202 "\nThe loss type must be either 1 (slack rescaling) or 2 (margin rescaling)!\n\n"); 00203 return false; 00204 } 00205 00206 if (learn_parm.rho < 0) { 00207 fprintf(stderr, 00208 "\nThe parameter rho for xi/alpha-estimates and leave-one-out pruning must\n"); 00209 fprintf(stderr, 00210 "be greater than zero (typically 1.0 or 2.0, see T. Joachims, Estimating the\n"); 00211 fprintf(stderr, 00212 "Generalization Performance of an SVM Efficiently, ICML, 2000.)!\n\n"); 00213 return false; 00214 } 00215 if ((learn_parm.xa_depth < 0) || (learn_parm.xa_depth > 100)) { 00216 fprintf(stderr, 00217 "\nThe parameter depth for ext. xi/alpha-estimates must be in [0..100] (zero\n"); 00218 fprintf(stderr, 00219 "for switching to the conventional xa/estimates described in T. Joachims,\n"); 00220 fprintf( 00221 stderr, 00222 "Estimating the Generalization Performance of an SVM Efficiently, ICML, 2000.)\n"); 00223 return false; 00224 } 00225 00226 /* Note that the validity of the value for struct_parm->prec_rec_k_frac in 00227 relation to #pos is checked in read_struct_examples() */ 00228 if (struct_parm.prec_rec_k_frac < 0) { 00229 fprintf(stderr, 00230 "\nThe value of option --k must be greater then zero!\n\n"); 00231 return false; 00232 } 00233 00234 return true; 00235 } 00236 00237 DOC* CSVMPERF::CreateDoc(Sleipnir::CPCLSet &PCLSet, size_t iGene, size_t iDoc) { 00238 WORD* aWords; 00239 size_t i, j, iWord, iWords, iPCL, iExp; 00240 float d; 00241 DOC* pRet; 00242 pRet->fvec->words[0].weight; 00243 //get number of features 00244 iWords = 0; 00245 for (i = 0; i < PCLSet.GetPCLs(); i++) { 00246 // cout<<"CD:PCLSET= "<<i<<endl; 00247 // cout<<"CD:numExp= "<<PCLSet.Get(i).GetExperiments()<<endl; 00248 iWords += PCLSet.Get(i).GetExperiments(); 00249 } 00250 // cout << "CD:iwords=" << iWords << endl; 00251 aWords = new WORD[iWords + 1]; 00252 //number the words 00253 for (i = 0; i < iWords; ++i) { 00254 // cout<<i<<endl; 00255 aWords[i].wnum = i + 1; 00256 // asWords[ i ].wnum = 0; 00257 } 00258 aWords[i].wnum = 0; 00259 //get the values; 00260 iWord = 0; 00261 for (i = 0; i < PCLSet.GetPCLs(); i++) { 00262 iExp = PCLSet.Get(i).GetExperiments(); 00263 for (j = 0; j < iExp; j++) { 00264 // cout<<"CD:iWord="<<iWord<<endl; 00265 if (!Sleipnir::CMeta::IsNaN(d = PCLSet.Get(i, iGene, j))) { 00266 // if (i==0 && j==0) 00267 // cout<<"First value is "<<d<<endl; 00268 aWords[iWord].weight = d; 00269 } else 00270 aWords[iWord].weight = 0; 00271 iWord++; 00272 } 00273 } 00274 pRet = create_example(iDoc, 0, 0, 1, create_svector(aWords, "", 1)); 00275 delete[] aWords; 00276 // cout<<"done creating DOC"<<endl; 00277 return pRet; 00278 } 00279 //For single genes usign single PCL 00280 00281 DOC* CSVMPERF::CreateDoc(Sleipnir::CPCL &PCL, size_t iGene, size_t iDoc) { 00282 WORD* aWords; 00283 size_t i, j, iWord, iWords, iPCL, iExp; 00284 float d; 00285 DOC* pRet; 00286 pRet->fvec->words[0].weight; 00287 //get number of features 00288 iWords = PCL.GetExperiments(); 00289 //cerr<<"Newing WORDS "<<(iWords+1)*sizeof(WORD)<<endl; 00290 aWords = new WORD[iWords + 1]; 00291 //set the words 00292 for (i = 0; i < iWords; ++i) { 00293 aWords[i].wnum = i + 1; 00294 if (!Sleipnir::CMeta::IsNaN(d = PCL.Get(iGene, i))) 00295 aWords[i].weight = d; 00296 else 00297 aWords[i].weight = 0; 00298 } 00299 aWords[i].wnum = 0; 00300 // cerr<<"START Create Example"<<endl; 00301 pRet = create_example(iDoc, 0, 0, 1, create_svector(aWords, "", 1)); 00302 //cerr<<"END create example"<<endl; 00303 delete[] aWords; 00304 return pRet; 00305 } 00306 //Single Gene using a Dat for data 00307 00308 DOC* CSVMPERF::CreateDoc(Sleipnir::CDat& Dat, size_t iGene, size_t iDoc) { 00309 WORD* aWords; 00310 size_t i, j, iWord, iWords; 00311 float d; 00312 DOC* pRet; 00313 pRet->fvec->words[0].weight; 00314 //get number of features 00315 iWords = Dat.GetGenes(); 00316 // cout << "CD:iwords=" << iWords << endl; 00317 aWords = new WORD[iWords + 1]; 00318 //number the words 00319 for (i = 0; i < iWords; ++i) { 00320 // cout<<i<<endl; 00321 aWords[i].wnum = i + 1; 00322 // asWords[ i ].wnum = 0; 00323 } 00324 aWords[i].wnum = 0; 00325 //get the values; 00326 iWord = 0; 00327 for (i = 0; i < Dat.GetGenes(); i++) { 00328 if (!Sleipnir::CMeta::IsNaN(d = Dat.Get(iGene, i))) { 00329 // if (i==0 && j==0) 00330 // cout<<"First value is "<<d<<endl; 00331 aWords[iWord].weight = d; 00332 } else 00333 aWords[iWord].weight = 0; 00334 iWord++; 00335 } 00336 pRet = create_example(iDoc, 0, 0, 1, create_svector(aWords, "", 1)); 00337 delete[] aWords; 00338 // cout<<"done creating DOC"<<endl; 00339 return pRet; 00340 } 00341 00342 //Create Sample functions for single genes 00343 00344 SAMPLE* CSVMPERF::CreateSample(Sleipnir::CPCLSet &PCLSet, 00345 vector<SVMLabel> SVMLabels) { 00346 size_t i, j, iGene, iDoc; 00347 vector<DOC*> vec_pDoc; 00348 vector<double> vecClass; 00349 vector<size_t> veciGene; 00350 iDoc = 0; 00351 float numPositives, numNegatives; 00352 numPositives = numNegatives = 0; 00353 for (i = 0; i < SVMLabels.size(); i++) { 00354 // cout<< "processing gene " << SVMLabels[i].GeneName << endl; 00355 if (!SVMLabels[i].hasIndex) { 00356 SVMLabels[i].SetIndex(PCLSet.GetGene(SVMLabels[i].GeneName)); 00357 } 00358 iGene = SVMLabels[i].index; 00359 // cout << SVMLabels[i].GeneName<<" gene at location "<<iGene << endl; 00360 if (iGene != -1) { 00361 // cout << "creating doc" << endl; 00362 iDoc++; 00363 vec_pDoc.push_back(CreateDoc(PCLSet, iGene, iDoc - 1)); 00364 vecClass.push_back(SVMLabels[i].Target); 00365 } 00366 } 00367 00368 DOC** ppDoc; 00369 ppDoc = new DOC*[vec_pDoc.size()]; 00370 copy(vec_pDoc.begin(), vec_pDoc.end(), ppDoc); 00371 vec_pDoc.clear(); 00372 PATTERN* pPattern = new PATTERN; 00373 pPattern->doc = ppDoc; 00374 00375 pPattern->totdoc = iDoc; 00376 // cout << "number of document=" << pPattern->totdoc << endl; 00377 LABEL* pLabel = new LABEL; 00378 double* aClass; 00379 aClass = new double[vecClass.size()]; 00380 copy(vecClass.begin(), vecClass.end(), aClass); 00381 vecClass.clear(); 00382 pLabel->Class = aClass; 00383 pLabel->totdoc = iDoc; 00384 00385 EXAMPLE* aExample; 00386 aExample = new EXAMPLE[1]; 00387 //cout<<"aExample @"<<aExample<<endl; 00388 aExample[0].x = *pPattern; 00389 aExample[0].y = *pLabel; 00390 SAMPLE* pSample = new SAMPLE; 00391 pSample->n = 1; 00392 pSample->examples = aExample; 00393 /* cout << "examples @" << pSample->examples << endl; 00394 cout<< "ppDoc="<<ppDoc<<endl; 00395 cout << "docs @" << pSample->examples[0].x.doc << endl; 00396 cout<<"done creating sample"<<endl; 00397 cout<<"sample @ "<<pSample<<endl;*/ 00398 return pSample; 00399 } 00400 00401 SAMPLE* CSVMPERF::CreateSample(Sleipnir::CPCL &PCL, vector<SVMLabel> SVMLabels) { 00402 size_t i, j, iGene, iDoc; 00403 // cerr<<"CREATE pDoc vector"<<endl; 00404 vector<DOC*> vec_pDoc; 00405 //cerr << "RESERVE pDoc"<<endl; 00406 vec_pDoc.reserve(SVMLabels.size()); 00407 //cerr<<"CREATE class"<<endl; 00408 vector<double> vecClass; 00409 //cerr<<"RESERVE class"<<endl; 00410 vecClass.reserve(SVMLabels.size()); 00411 iDoc = 0; 00412 float numPositives, numNegatives; 00413 numPositives = numNegatives = 0; 00414 for (i = 0; i < SVMLabels.size(); i++) { 00415 // cout<< "processing gene " << SVMLabels[i].GeneName << endl; 00416 if (!SVMLabels[i].hasIndex) { 00417 SVMLabels[i].SetIndex(PCL.GetGene(SVMLabels[i].GeneName)); 00418 } 00419 iGene = SVMLabels[i].index; 00420 // cout << SVMLabels[i].GeneName<<" gene at location "<<iGene << endl; 00421 if (iGene != -1) { 00422 // cout << "creating doc" << endl; 00423 iDoc++; 00424 vec_pDoc.push_back(CreateDoc(PCL, iGene, iDoc - 1)); 00425 vecClass.push_back(SVMLabels[i].Target); 00426 } 00427 } 00428 00429 DOC** ppDoc; 00430 //cerr<<"NEW ppDoc"<<endl; 00431 ppDoc = new DOC*[vec_pDoc.size()]; 00432 copy(vec_pDoc.begin(), vec_pDoc.end(), ppDoc); 00433 vec_pDoc.clear(); 00434 //cerr<<"NEW Pattern"<<endl; 00435 PATTERN* pPattern = new PATTERN; 00436 pPattern->doc = ppDoc; 00437 00438 pPattern->totdoc = iDoc; 00439 LABEL* pLabel = new LABEL; 00440 double* aClass; 00441 cerr << "NEW Class array" << endl; 00442 aClass = new double[vecClass.size()]; 00443 copy(vecClass.begin(), vecClass.end(), aClass); 00444 vecClass.clear(); 00445 pLabel->Class = aClass; 00446 pLabel->totdoc = iDoc; 00447 00448 EXAMPLE* aExample; 00449 //cerr<<"NEW Example"<<endl; 00450 aExample = new EXAMPLE[1]; 00451 //cout<<"aExample @"<<aExample<<endl; 00452 aExample[0].x = *pPattern; 00453 aExample[0].y = *pLabel; 00454 //cerr<<"NEW Sample"<<endl; 00455 SAMPLE* pSample = new SAMPLE; 00456 pSample->n = 1; 00457 pSample->examples = aExample; 00458 /* cout << "examples @" << pSample->examples << endl; 00459 cout<< "ppDoc="<<ppDoc<<endl; 00460 cout << "docs @" << pSample->examples[0].x.doc << endl; 00461 cout<<"done creating sample"<<endl; 00462 cout<<"sample @ "<<pSample<<endl;*/ 00463 return pSample; 00464 //cerr<<"DONE CreateSample"<<endl; 00465 } 00466 00467 SAMPLE** CSVMPERF::CreateSampleBootStrap(Sleipnir::CPCL &PCL, 00468 vector<SVMLabel>& SVMLabels, vector<vector<size_t> > vecvecIndex) { 00469 00470 size_t i, j, iGene, iDoc; 00471 vector<DOC*> vec_pDoc; 00472 vec_pDoc.reserve(SVMLabels.size()); 00473 vector<vector<double> > vecvecClass; 00474 vector<double> vecClass; 00475 vecClass.reserve(SVMLabels.size()); 00476 iDoc = 0; 00477 float numPositives, numNegatives; 00478 numPositives = numNegatives = 0; 00479 //Creat all the docs once 00480 for (i = 0; i < SVMLabels.size(); i++) { 00481 //the labels will have an index 00482 iGene = SVMLabels[i].index; 00483 if (iGene != -1) { 00484 iDoc++; 00485 vec_pDoc.push_back(CreateDoc(PCL, iGene, iDoc - 1)); 00486 vecClass.push_back(SVMLabels[i].Target); 00487 00488 } 00489 } 00490 size_t numBootStraps = vecvecIndex.size(); 00491 SAMPLE** ppSample = new SAMPLE*[numBootStraps]; 00492 DOC** ppDoc; 00493 for (i = 0; i < numBootStraps; i++) { 00494 //get a new ppDoc 00495 ppDoc = new DOC*[vecvecIndex[i].size()]; 00496 for (j = 0; j < vecvecIndex[i].size(); j++) { 00497 ppDoc[j] = vec_pDoc[vecvecIndex[i][j]]; //assign the pointer 00498 } 00499 //set up the pattern 00500 PATTERN* pPattern = new PATTERN; 00501 pPattern->doc = ppDoc; 00502 pPattern->totdoc = iDoc; 00503 00504 //set up the labels 00505 LABEL* pLabel = new LABEL; 00506 double* aClass; 00507 aClass = new double[vecvecIndex[i].size()]; 00508 for (j = 0; j < vecvecIndex[i].size(); j++) { 00509 aClass[j] = SVMLabels[vecvecIndex[i][j]].Target; 00510 } 00511 00512 pLabel->Class = aClass; 00513 pLabel->totdoc = iDoc; 00514 00515 //set up the Example 00516 EXAMPLE* aExample; 00517 aExample = new EXAMPLE[1]; 00518 aExample[0].x = *pPattern; 00519 aExample[0].y = *pLabel; 00520 00521 //set up the Sample 00522 ppSample[i] = new SAMPLE; 00523 ppSample[i]->n = 1; 00524 ppSample[i]->examples = aExample; 00525 } 00526 return ppSample; 00527 } 00528 00529 SAMPLE * CSVMPERF::CreateSample(Sleipnir::CDat& Dat, vector<SVMLabel> SVMLabels) { 00530 size_t i, j, iGene, iDoc; 00531 vector<DOC*> vec_pDoc; 00532 vector<double> vecClass; 00533 vector<size_t> veciGene; 00534 iDoc = 0; 00535 float numPositives, numNegatives; 00536 numPositives = numNegatives = 0; 00537 for (i = 0; i < SVMLabels.size(); i++) { 00538 // cout<< "processing gene " << SVMLabels[i].GeneName << endl; 00539 iGene = Dat.GetGene(SVMLabels[i].GeneName); 00540 // cout << SVMLabels[i].GeneName<<" gene at location "<<iGene << endl; 00541 if (iGene != -1) { 00542 // cout << "creating doc" << endl; 00543 iDoc++; 00544 vec_pDoc.push_back(CreateDoc(Dat, iGene, iDoc - 1)); 00545 vecClass.push_back(SVMLabels[i].Target); 00546 } 00547 } 00548 00549 DOC** ppDoc; 00550 ppDoc = new DOC*[vec_pDoc.size()]; 00551 copy(vec_pDoc.begin(), vec_pDoc.end(), ppDoc); 00552 vec_pDoc.clear(); 00553 PATTERN* pPattern = new PATTERN; 00554 pPattern->doc = ppDoc; 00555 00556 pPattern->totdoc = iDoc; 00557 // cout << "number of document=" << pPattern->totdoc << endl; 00558 LABEL* pLabel = new LABEL; 00559 double* aClass; 00560 aClass = new double[vecClass.size()]; 00561 copy(vecClass.begin(), vecClass.end(), aClass); 00562 vecClass.clear(); 00563 pLabel->Class = aClass; 00564 pLabel->totdoc = iDoc; 00565 00566 EXAMPLE* aExample; 00567 aExample = new EXAMPLE[1]; 00568 //cout<<"aExample @"<<aExample<<endl; 00569 aExample[0].x = *pPattern; 00570 aExample[0].y = *pLabel; 00571 SAMPLE* pSample = new SAMPLE; 00572 pSample->n = 1; 00573 pSample->examples = aExample; 00574 /* cout << "examples @" << pSample->examples << endl; 00575 cout<< "ppDoc="<<ppDoc<<endl; 00576 cout << "docs @" << pSample->examples[0].x.doc << endl; 00577 cout<<"done creating sample"<<endl; 00578 cout<<"sample @ "<<pSample<<endl;*/ 00579 return pSample; 00580 } 00581 00582 //Single gene classification 00583 00584 vector<Result> CSVMPERF::Classify(Sleipnir::CPCL &PCL, 00585 vector<SVMLabel> SVMLabels) { 00586 size_t i, j, iGene, iDoc; 00587 vector<DOC*> vec_pDoc; 00588 vector<double> vecClass; 00589 vector<Result> vecResult; 00590 iDoc = 0; 00591 DOC** ppDoc; 00592 ppDoc = new DOC*[1]; 00593 PATTERN pattern; 00594 pattern.doc = ppDoc; 00595 pattern.totdoc = 1; 00596 //cerr << "CLASSIFY classifying " << endl; 00597 LABEL label; 00598 for (i = 0; i < SVMLabels.size(); i++) { 00599 if (!SVMLabels[i].hasIndex) { 00600 SVMLabels[i].SetIndex(PCL.GetGene(SVMLabels[i].GeneName)); 00601 } 00602 iGene = SVMLabels[i].index; 00603 // cout << "CLASS gene=" << iGene << endl; 00604 if (iGene != -1) { 00605 iDoc++; 00606 00607 //cout << "CLASS iDOC=" << iDoc << endl; 00608 ppDoc[0] = CreateDoc(PCL, iGene, iDoc); 00609 label 00610 = classify_struct_example(pattern, &structmodel, 00611 &struct_parm); 00612 00613 vecClass.push_back(SVMLabels[i].Target); 00614 vecResult.resize(iDoc); 00615 vecResult[iDoc - 1].GeneName = SVMLabels[i].GeneName; 00616 vecResult[iDoc - 1].Target = SVMLabels[i].Target; 00617 vecResult[iDoc - 1].Value = label.Class[0]; 00618 //cerr<<"CLASSIFY Called FreeDoc"<<endl; 00619 FreeDoc(ppDoc[0]); 00620 //cerr<<"CLASSIFY End FreeDoc"<<endl; 00621 } 00622 } 00623 00624 delete[] ppDoc; 00625 return vecResult; 00626 } 00627 00628 vector<Result> CSVMPERF::Classify(Sleipnir::CPCLSet &PCLSet, 00629 vector<SVMLabel> SVMLabels) { 00630 size_t i, j, iGene, iDoc; 00631 vector<DOC*> vec_pDoc; 00632 vector<double> vecClass; 00633 vector<Result> vecResult; 00634 iDoc = 0; 00635 for (i = 0; i < SVMLabels.size(); i++) { 00636 iGene = PCLSet.GetGene(SVMLabels[i].GeneName); 00637 // cout << "CLASS gene=" << iGene << endl; 00638 if (iGene != -1) { 00639 iDoc++; 00640 // cout << "CLASS iDOC=" << iDoc << endl; 00641 vec_pDoc.push_back(CreateDoc(PCLSet, iGene, iDoc)); 00642 vecClass.push_back(SVMLabels[i].Target); 00643 vecResult.resize(iDoc); 00644 vecResult[iDoc - 1].GeneName = SVMLabels[i].GeneName; 00645 vecResult[iDoc - 1].Target = SVMLabels[i].Target; 00646 } 00647 } 00648 DOC** ppDoc; 00649 ppDoc = new DOC*[vec_pDoc.size()]; 00650 copy(vec_pDoc.begin(), vec_pDoc.end(), ppDoc); 00651 vec_pDoc.clear(); 00652 PATTERN pattern; 00653 pattern.doc = ppDoc; 00654 pattern.totdoc = iDoc; 00655 00656 LABEL label = classify_struct_example(pattern, &structmodel, &struct_parm); 00657 // cout << "label totdoc=" << label.totdoc << endl; 00658 for (i = 0; i < label.totdoc; i++) { 00659 vecResult[i].Value = label.Class[i]; 00660 // cout << "CLASS: i=" << i << " value=" << vecResult[i].Value << endl; 00661 } 00662 FreePattern(pattern); 00663 return vecResult; 00664 } 00665 00666 vector<Result> CSVMPERF::Classify(Sleipnir::CDat &Dat, 00667 vector<SVMLabel> SVMLabels) { 00668 size_t i, j, iGene, iDoc; 00669 vector<DOC*> vec_pDoc; 00670 vector<double> vecClass; 00671 vector<Result> vecResult; 00672 iDoc = 0; 00673 for (i = 0; i < SVMLabels.size(); i++) { 00674 iGene = Dat.GetGene(SVMLabels[i].GeneName); 00675 // cout << "CLASS gene=" << iGene << endl; 00676 if (iGene != -1) { 00677 iDoc++; 00678 // cout << "CLASS iDOC=" << iDoc << endl; 00679 vec_pDoc.push_back(CreateDoc(Dat, iGene, iDoc)); 00680 vecClass.push_back(SVMLabels[i].Target); 00681 vecResult.resize(iDoc); 00682 vecResult[iDoc - 1].GeneName = SVMLabels[i].GeneName; 00683 vecResult[iDoc - 1].Target = SVMLabels[i].Target; 00684 } 00685 } 00686 DOC** ppDoc; 00687 ppDoc = new DOC*[vec_pDoc.size()]; 00688 copy(vec_pDoc.begin(), vec_pDoc.end(), ppDoc); 00689 vec_pDoc.clear(); 00690 PATTERN pattern; 00691 pattern.doc = ppDoc; 00692 pattern.totdoc = iDoc; 00693 00694 LABEL label = classify_struct_example(pattern, &structmodel, &struct_parm); 00695 // cout << "label totdoc=" << label.totdoc << endl; 00696 for (i = 0; i < label.totdoc; i++) { 00697 vecResult[i].Value = label.Class[i]; 00698 // cout << "CLASS: i=" << i << " value=" << vecResult[i].Value << endl; 00699 } 00700 FreePattern(pattern); 00701 return vecResult; 00702 } 00703 00704 //Create DOC for pais of genes 00705 //For pairs of genes using PCLSet 00706 00707 DOC* CSVMPERF::CreateDoc(Sleipnir::CPCL &PCL, size_t iGene, size_t jGene, 00708 size_t iDoc) { 00709 WORD* aWords; 00710 size_t i, j, iWord, iWords, iPCL, iExp; 00711 float d, e; 00712 DOC* pRet; 00713 pRet->fvec->words[0].weight; 00714 //get number of features 00715 00716 iWords = PCL.GetExperiments(); 00717 00718 // cout << "CD:iwords=" << iWords << endl; 00719 aWords = new WORD[iWords + 1]; 00720 //number the words 00721 for (i = 0; i < iWords; ++i) { 00722 // cout<<i<<endl; 00723 aWords[i].wnum = i + 1; 00724 // asWords[ i ].wnum = 0; 00725 } 00726 aWords[i].wnum = 0; 00727 //get the values; 00728 iWord = 0; 00729 00730 for (j = 0; j < PCL.GetExperiments(); j++) { 00731 // cout<<"CD:iWord="<<iWord<<endl; 00732 if (!Sleipnir::CMeta::IsNaN(d = PCL.Get(iGene, j)) 00733 && !Sleipnir::CMeta::IsNaN(e = PCL.Get(jGene, j))) { 00734 // if (i==0 && j==0) 00735 // cout<<"First value is "<<d<<endl; 00736 aWords[iWord].weight = d * e; 00737 } else 00738 aWords[iWord].weight = 0; 00739 iWord++; 00740 } 00741 00742 pRet = create_example(iDoc, 0, 0, 1, create_svector(aWords, "", 1)); 00743 delete[] aWords; 00744 // cout<<"done creating DOC"<<endl; 00745 return pRet; 00746 } 00747 00748 //Create Sample for pairs of genes, wraps the above 2 functions 00749 00750 SAMPLE* CSVMPERF::CreateSample(Sleipnir::CPCL &PCL, Sleipnir::CDat& Answers, 00751 const vector<string>& CVGenes) { 00752 size_t i, j, iGene, jGene, iDoc; 00753 vector<DOC*> vec_pDoc; 00754 vector<double> vecClass; 00755 vector<size_t> veciGene; 00756 Sleipnir::CPCL* pP = &PCL; 00757 iDoc = 0; 00758 float numPositives, numNegatives; 00759 numPositives = numNegatives = 0; 00760 set<string> setGenes; 00761 set<string>::iterator iterSet; 00762 float w; 00763 for (i = 0; i < CVGenes.size(); i++) { 00764 setGenes.insert(CVGenes[i]); 00765 } 00766 for (i = 0; i < Answers.GetGenes() - 1; i++) { 00767 if ((setGenes.find(Answers.GetGene(i)) != setGenes.end()) && ((iGene 00768 = PCL.GetGene(Answers.GetGene(i))) != -1)) { 00769 for (j = i + 1; j < Answers.GetGenes(); j++) { 00770 if ((setGenes.find(Answers.GetGene(j)) != setGenes.end()) 00771 && ((jGene = PCL.GetGene(Answers.GetGene(j))) != -1)) { 00772 if (!Sleipnir::CMeta::IsNaN(w = Answers.Get(i, j))) { 00773 iDoc++; 00774 vec_pDoc.push_back(CreateDoc(PCL, iGene, jGene, iDoc 00775 - 1)); 00776 00777 } 00778 } 00779 } 00780 } 00781 } 00782 00783 DOC** ppDoc; 00784 ppDoc = new DOC*[vec_pDoc.size()]; 00785 copy(vec_pDoc.begin(), vec_pDoc.end(), ppDoc); 00786 vec_pDoc.clear(); 00787 PATTERN* pPattern = new PATTERN; 00788 pPattern->doc = ppDoc; 00789 00790 pPattern->totdoc = iDoc; 00791 LABEL* pLabel = new LABEL; 00792 double* aClass; 00793 aClass = new double[vecClass.size()]; 00794 copy(vecClass.begin(), vecClass.end(), aClass); 00795 vecClass.clear(); 00796 pLabel->Class = aClass; 00797 pLabel->totdoc = iDoc; 00798 00799 EXAMPLE* aExample; 00800 aExample = new EXAMPLE[1]; 00801 aExample[0].x = *pPattern; 00802 aExample[0].y = *pLabel; 00803 SAMPLE* pSample = new SAMPLE; 00804 pSample->n = 1; 00805 pSample->examples = aExample; 00806 return pSample; 00807 } 00808 00809 void CSVMPERF::Classify(Sleipnir::CPCL& PCL, Sleipnir::CDat& Answers, 00810 Sleipnir::CDat& Values, Sleipnir::CDat& Counts, 00811 const vector<string>& CVGenes) { 00812 size_t i, j, iGene, jGene, iDoc; 00813 set<string> setGenes; 00814 set<string>::iterator iterSet; 00815 vector<DOC*> vec_pDoc; 00816 vector<pair<size_t, size_t> > vecPairIndex; 00817 iDoc = 0; 00818 for (i = 0; i < CVGenes.size(); i++) { 00819 setGenes.insert(CVGenes[i]); 00820 } 00821 00822 cout << "the number of genes to be classified is " << setGenes.size() 00823 << endl; 00824 for (i = 0; i < Answers.GetGenes() - 1; i++) { 00825 if ((setGenes.find(Answers.GetGene(i)) != setGenes.end()) && ((iGene 00826 = PCL.GetGene(Answers.GetGene(i))) != -1)) { 00827 for (j = i + 1; j < Answers.GetGenes(); j++) { 00828 if ((setGenes.find(Answers.GetGene(j)) != setGenes.end()) 00829 && ((jGene = PCL.GetGene(Answers.GetGene(j))) != -1)) { 00830 if (!Sleipnir::CMeta::IsNaN(Answers.Get(i, j))) { 00831 iDoc++; 00832 vec_pDoc.push_back(CreateDoc(PCL, iGene, jGene, iDoc 00833 - 1)); 00834 vecPairIndex.push_back(pair<size_t, size_t> (i, j)); 00835 } 00836 } 00837 } 00838 } 00839 } 00840 DOC** ppDoc; 00841 ppDoc = new DOC*[vec_pDoc.size()]; 00842 copy(vec_pDoc.begin(), vec_pDoc.end(), ppDoc); 00843 vec_pDoc.clear(); 00844 PATTERN pattern; 00845 pattern.doc = ppDoc; 00846 pattern.totdoc = iDoc; 00847 00848 LABEL label = classify_struct_example(pattern, &structmodel, &struct_parm); 00849 pair<size_t, size_t> tmpPair; 00850 float d; 00851 for (i = 0; i < vecPairIndex.size(); i++) { 00852 tmpPair = vecPairIndex[i]; 00853 if (!Sleipnir::CMeta::IsNaN(d = Values.Get(tmpPair.first, 00854 tmpPair.second))) 00855 Values.Set(tmpPair.first, tmpPair.second, d + label.Class[i]); 00856 else 00857 Values.Set(tmpPair.first, tmpPair.second, label.Class[i]); 00858 if (!Sleipnir::CMeta::IsNaN(d = Counts.Get(tmpPair.first, 00859 tmpPair.second))) 00860 Counts.Set(tmpPair.first, tmpPair.second, d + 1); 00861 else 00862 Counts.Set(tmpPair.first, tmpPair.second, 1); 00863 } 00864 FreePattern(pattern); 00865 00866 } 00867 00868 void CSVMPERF::ClassifyAll(Sleipnir::CPCL& PCL, Sleipnir::CDat& Values, 00869 Sleipnir::CDat& Counts, const vector<string>& CVGenes) { 00870 size_t iGene, jGene, i, j, k; 00871 string strGeneOne, strGeneTwo; 00872 set<string> setGenes; 00873 set<string>::iterator iterSet; 00874 vector<DOC*> vec_pDoc; 00875 vector<pair<size_t, size_t> > vecPairIndex; 00876 size_t iDoc; 00877 for (i = 0; i < CVGenes.size(); i++) { 00878 setGenes.insert(CVGenes[i]); 00879 } 00880 00881 for (i = 0; i < PCL.GetGenes() - 1; i++) { 00882 if (setGenes.find(PCL.GetGene(i)) == setGenes.end()) { 00883 iDoc = 0; 00884 for (j = i + 1; j < PCL.GetGenes() - 1; j++) { 00885 if (setGenes.find(PCL.GetGene(j)) == setGenes.end()) { 00886 iDoc++; 00887 vec_pDoc.push_back(CreateDoc(PCL, i, j, iDoc - 1)); 00888 vecPairIndex.push_back(pair<size_t, size_t> (i, j)); 00889 } 00890 } 00891 DOC** ppDoc; 00892 ppDoc = new DOC*[vec_pDoc.size()]; 00893 copy(vec_pDoc.begin(), vec_pDoc.end(), ppDoc); 00894 vec_pDoc.clear(); 00895 PATTERN pattern; 00896 pattern.doc = ppDoc; 00897 pattern.totdoc = iDoc; 00898 00899 LABEL label = classify_struct_example(pattern, &structmodel, 00900 &struct_parm); 00901 pair<size_t, size_t> tmpPair; 00902 float d; 00903 for (k = 0; k < vecPairIndex.size(); k++) { 00904 tmpPair = vecPairIndex[k]; 00905 if (!Sleipnir::CMeta::IsNaN(d = Values.Get(tmpPair.first, 00906 tmpPair.second))) 00907 Values.Set(tmpPair.first, tmpPair.second, d 00908 + label.Class[k]); 00909 else 00910 Values.Set(tmpPair.first, tmpPair.second, label.Class[k]); 00911 if (!Sleipnir::CMeta::IsNaN(d = Counts.Get(tmpPair.first, 00912 tmpPair.second))) 00913 Counts.Set(tmpPair.first, tmpPair.second, d + 1); 00914 else 00915 Counts.Set(tmpPair.first, tmpPair.second, 1); 00916 } 00917 vecPairIndex.resize(0); 00918 FreePattern(pattern); 00919 } 00920 } 00921 } 00922 00923 void CSVMPERF::ClassifyAll(Sleipnir::CPCL& PCL, Sleipnir::CDat& Values, 00924 const vector<string>& CVGenes) { 00925 size_t iGene, jGene, i, j, k; 00926 string strGeneOne, strGeneTwo; 00927 set<string> setGenes; 00928 set<string>::iterator iterSet; 00929 vector<DOC*> vec_pDoc; 00930 vector<pair<size_t, size_t> > vecPairIndex; 00931 size_t iDoc; 00932 for (i = 0; i < CVGenes.size(); i++) { 00933 setGenes.insert(CVGenes[i]); 00934 } 00935 00936 for (i = 0; i < PCL.GetGenes() - 1; i++) { 00937 if (setGenes.find(PCL.GetGene(i)) == setGenes.end()) { 00938 iDoc = 0; 00939 for (j = i + 1; j < PCL.GetGenes() - 1; j++) { 00940 if (setGenes.find(PCL.GetGene(j)) == setGenes.end()) { 00941 iDoc++; 00942 vec_pDoc.push_back(CreateDoc(PCL, i, j, iDoc - 1)); 00943 vecPairIndex.push_back(pair<size_t, size_t> (i, j)); 00944 } 00945 } 00946 DOC** ppDoc; 00947 ppDoc = new DOC*[vec_pDoc.size()]; 00948 copy(vec_pDoc.begin(), vec_pDoc.end(), ppDoc); 00949 vec_pDoc.clear(); 00950 PATTERN pattern; 00951 pattern.doc = ppDoc; 00952 pattern.totdoc = iDoc; 00953 00954 LABEL label = classify_struct_example(pattern, &structmodel, 00955 &struct_parm); 00956 pair<size_t, size_t> tmpPair; 00957 float d; 00958 for (k = 0; k < vecPairIndex.size(); k++) { 00959 tmpPair = vecPairIndex[k]; 00960 Values.Set(tmpPair.first, tmpPair.second, d + label.Class[k]); 00961 } 00962 vecPairIndex.resize(0); 00963 FreePattern(pattern); 00964 } 00965 } 00966 } 00967 00968 00969 // populate docs for each label gene pair from a vector of dat file names 00970 bool CSVMPERF::CreateDoc(vector<string>& vecstrDatasets, 00971 vector<SVMLabelPair*>& vecLabels, 00972 const vector<string>& LabelsGene, 00973 Sleipnir::CDat::ENormalize eNormalize){ 00974 00975 size_t i, j, k, iGene, jGene, iDoc, iWord, iWords; 00976 float d; 00977 vector<size_t> veciGene; 00978 vector<WORD*> vec_pWord; 00979 vector<size_t> labelg2dat; 00980 00981 WORD* aWords; 00982 DOC* pRet; 00983 Sleipnir::CDat Dat; 00984 00985 iWords = vecstrDatasets.size(); 00986 00987 vec_pWord.reserve(vecLabels.size()); 00988 00989 // initialize all the WORDs 00990 for(i=0; i < vecLabels.size(); i++){ 00991 aWords = new WORD[iWords + 1]; 00992 00993 // set wnum values 00994 for (k = 0; k < iWords; ++k) { 00995 // cout<<i<<endl; 00996 aWords[k].wnum = k + 1; 00997 // asWords[ i ].wnum = 0; 00998 } 00999 aWords[k].wnum = 0; 01000 vec_pWord[i] = aWords; 01001 } 01002 01003 // initialize the gene mappings 01004 labelg2dat.resize(LabelsGene.size()); 01005 01006 // now open up all datasets 01007 for(i=0; i < vecstrDatasets.size(); i++){ 01008 if(!Dat.Open(vecstrDatasets[i].c_str())) { 01009 cerr << vecstrDatasets[i].c_str() << endl; 01010 cerr << "Could not open: " << vecstrDatasets[i] << endl; 01011 return false; 01012 } 01013 01014 // normalize dat file 01015 if( eNormalize != Sleipnir::CDat::ENormalizeNone ){ 01016 cerr << "Normalize input data" << endl; 01017 Dat.Normalize( eNormalize ); 01018 } 01019 01020 cerr << "Open: " << vecstrDatasets[i] << endl; 01021 01022 // construct gene name mapping 01023 for(k=0; k < LabelsGene.size(); k++){ 01024 labelg2dat[k] = Dat.GetGene(LabelsGene[k]); 01025 } 01027 01028 for (j = 0; j < vecLabels.size(); j++) { 01029 aWords = vec_pWord[j]; 01030 01031 if( ((iGene = labelg2dat[vecLabels[j]->iidx]) == -1 ) || 01032 ((jGene = labelg2dat[vecLabels[j]->jidx]) == -1 )){ 01033 aWords[i].weight = 0; 01034 continue; 01035 } 01036 01037 if (!Sleipnir::CMeta::IsNaN(d = Dat.Get(iGene, jGene))) { 01038 aWords[i].weight = d; 01039 } else 01040 aWords[i].weight = 0; 01041 } 01042 } 01043 01044 // now create a Doc per label 01045 for (j = 0; j < vecLabels.size(); j++) { 01046 //pRet->fvec->words[0].weight; 01047 aWords = vec_pWord[j]; 01048 pRet = create_example(j, 0, 0, 1, create_svector(aWords, "", 1)); 01049 vecLabels[j]->pDoc = pRet; 01050 delete[] aWords; 01051 } 01052 01053 return true; 01054 } 01055 01056 SAMPLE* CSVMPERF::CreateSample(vector<SVMLabelPair*>& SVMLabels) { 01057 size_t i, j, iGene, iDoc; 01058 vector<DOC*> vec_pDoc; 01059 vector<double> vecClass; 01060 iDoc = 0; 01061 for (i = 0; i < SVMLabels.size(); i++) { 01062 iDoc++; 01063 vec_pDoc.push_back(SVMLabels[i]->pDoc); 01064 vecClass.push_back(SVMLabels[i]->Target); 01065 } 01066 01067 DOC** ppDoc; 01068 ppDoc = new DOC*[vec_pDoc.size()]; 01069 copy(vec_pDoc.begin(), vec_pDoc.end(), ppDoc); 01070 vec_pDoc.clear(); 01071 PATTERN* pPattern = new PATTERN; 01072 pPattern->doc = ppDoc; 01073 01074 pPattern->totdoc = iDoc; 01075 // cout << "number of document=" << pPattern->totdoc << endl; 01076 LABEL* pLabel = new LABEL; 01077 double* aClass; 01078 aClass = new double[vecClass.size()]; 01079 copy(vecClass.begin(), vecClass.end(), aClass); 01080 vecClass.clear(); 01081 pLabel->Class = aClass; 01082 pLabel->totdoc = iDoc; 01083 01084 EXAMPLE* aExample; 01085 aExample = new EXAMPLE[1]; 01086 //cout<<"aExample @"<<aExample<<endl; 01087 aExample[0].x = *pPattern; 01088 aExample[0].y = *pLabel; 01089 SAMPLE* pSample = new SAMPLE; 01090 pSample->n = 1; 01091 pSample->examples = aExample; 01092 /* cout << "examples @" << pSample->examples << endl; 01093 cout<< "ppDoc="<<ppDoc<<endl; 01094 cout << "docs @" << pSample->examples[0].x.doc << endl; 01095 cout<<"done creating sample"<<endl; 01096 cout<<"sample @ "<<pSample<<endl;*/ 01097 return pSample; 01098 } 01099 01100 void CSVMPERF::Classify(Sleipnir::CDat &Results, 01101 vector<SVMLabelPair*>& SVMLabels) { 01102 size_t i, iGene, iDoc; 01103 iDoc = 0; 01104 DOC** ppDoc; 01105 ppDoc = new DOC*[1]; 01106 PATTERN pattern; 01107 pattern.doc = ppDoc; 01108 pattern.totdoc = 1; 01109 //cerr << "CLASSIFY classifying " << endl; 01110 LABEL label; 01111 for (i = 0; i < SVMLabels.size(); i++) { 01112 ppDoc[0] = SVMLabels[i]->pDoc; 01113 label 01114 = classify_struct_example(pattern, &structmodel, 01115 &struct_parm); 01116 01117 Results.Set(SVMLabels[i]->iidx, SVMLabels[i]->jidx, label.Class[0]); 01118 } 01119 01120 delete ppDoc; 01121 } 01122 01123 void CSVMPERF::FreeSample_leave_Doc(SAMPLE s){ 01124 /* Frees the memory of sample s. */ 01125 int i; 01126 for(i=0;i<s.n;i++) { 01127 free(s.examples[i].x.doc); 01128 free_label(s.examples[i].y); 01129 } 01130 free(s.examples); 01131 } 01132 01133 // Platt's binary SVM Probablistic Output 01134 // Assume dec_values and labels have same dimensions and genes 01135 void CSVMPERF::sigmoid_train(Sleipnir::CDat& Results, 01136 vector<SVMLabelPair*>& SVMLabels, 01137 float& A, float& B){ 01138 double prior1=0, prior0 = 0; 01139 size_t i, j, idx; 01140 float d, lab; 01141 01142 int max_iter=100; // Maximal number of iterations 01143 double min_step=1e-10; // Minimal step taken in line search 01144 double sigma=1e-12; // For numerically strict PD of Hessian 01145 double eps=1e-5; 01146 vector<double> t; 01147 double fApB,p,q,h11,h22,h21,g1,g2,det,dA,dB,gd,stepsize; 01148 double newA,newB,newf,d1,d2; 01149 int iter; 01150 vector<float> dec_values; 01151 01152 // Negatives are values less than 0 01153 for(i = 0; i < SVMLabels.size(); i++) 01154 if( SVMLabels[i]->Target > 0 ) 01155 prior1 += 1; 01156 else if(SVMLabels[i]->Target < 0 ) 01157 prior0 += 1; 01158 01159 // initialize size 01160 t.resize(prior0+prior1); 01161 01162 01163 // classify training examples 01164 LABEL label; 01165 size_t iGene, iDoc; 01166 iDoc = 0; 01167 DOC** ppDoc; 01168 ppDoc = new DOC*[1]; 01169 PATTERN pattern; 01170 pattern.doc = ppDoc; 01171 pattern.totdoc = 1; 01172 01173 dec_values.resize(t.size()); 01174 01175 idx = 0; 01176 for (i = 0; i < SVMLabels.size(); i++) { 01177 if( SVMLabels[i]->Target == 0 ) 01178 continue; 01179 if( Sleipnir::CMeta::IsNaN(d = Results.Get(SVMLabels[i]->iidx, SVMLabels[i]->jidx))) 01180 continue; 01181 01182 dec_values[idx] = d; 01183 idx++; 01184 } 01185 01186 // Initial Point and Initial Fun Value 01187 A=0.0; B=log((prior0+1.0)/(prior1+1.0)); 01188 double hiTarget=(prior1+1.0)/(prior1+2.0); 01189 double loTarget=1/(prior0+2.0); 01190 double fval = 0.0; 01191 01192 idx = 0; 01193 for(i = 0; i < SVMLabels.size(); i++){ 01194 if ( SVMLabels[i]->Target > 0) t[idx]=hiTarget; 01195 else if( SVMLabels[i]->Target < 0 ) t[idx]=loTarget; 01196 else 01197 continue; 01198 01199 fApB = dec_values[idx]*A+B; 01200 if (fApB>=0) 01201 fval += t[idx]*fApB + log(1+exp(-fApB)); 01202 else 01203 fval += (t[idx] - 1)*fApB +log(1+exp(fApB)); 01204 01205 idx++; 01206 } 01207 01208 for (iter=0;iter<max_iter;iter++) 01209 { 01210 // Update Gradient and Hessian (use H' = H + sigma I) 01211 h11=sigma; // numerically ensures strict PD 01212 h22=sigma; 01213 h21=0.0;g1=0.0;g2=0.0; 01214 for (i=0; i < dec_values.size(); i++) 01215 { 01216 fApB = dec_values[i]*A+B; 01217 if (fApB >= 0) 01218 { 01219 p=exp(-fApB)/(1.0+exp(-fApB)); 01220 q=1.0/(1.0+exp(-fApB)); 01221 } 01222 else 01223 { 01224 p=1.0/(1.0+exp(fApB)); 01225 q=exp(fApB)/(1.0+exp(fApB)); 01226 } 01227 d2=p*q; 01228 h11+=dec_values[i]*dec_values[i]*d2; 01229 h22+=d2; 01230 h21+=dec_values[i]*d2; 01231 d1=t[i]-p; 01232 g1+=dec_values[i]*d1; 01233 g2+=d1; 01234 } 01235 01236 // Stopping Criteria 01237 if (fabs(g1)<eps && fabs(g2)<eps) 01238 break; 01239 01240 // Finding Newton direction: -inv(H') * g 01241 det=h11*h22-h21*h21; 01242 dA=-(h22*g1 - h21 * g2) / det; 01243 dB=-(-h21*g1+ h11 * g2) / det; 01244 gd=g1*dA+g2*dB; 01245 01246 01247 stepsize = 1; // Line Search 01248 while (stepsize >= min_step) 01249 { 01250 newA = A + stepsize * dA; 01251 newB = B + stepsize * dB; 01252 01253 // New function value 01254 newf = 0.0; 01255 for (i=0; i < dec_values.size(); i++) 01256 { 01257 fApB = dec_values[i]*newA+newB; 01258 if (fApB >= 0) 01259 newf += t[i]*fApB + log(1+exp(-fApB)); 01260 else 01261 newf += (t[i] - 1)*fApB +log(1+exp(fApB)); 01262 } 01263 // Check sufficient decrease 01264 if (newf<fval+0.0001*stepsize*gd) 01265 { 01266 A=newA;B=newB;fval=newf; 01267 break; 01268 } 01269 else 01270 stepsize = stepsize / 2.0; 01271 } 01272 01273 if (stepsize < min_step) 01274 { 01275 cerr << "Line search fails in two-class probability estimates\n"; 01276 break; 01277 } 01278 } 01279 01280 if (iter>=max_iter) 01281 cerr << "Reaching maximal iterations in two-class probability estimates\n"; 01282 } 01283 01284 void CSVMPERF::sigmoid_predict(Sleipnir::CDat& Results, vector<SVMLabelPair*>& SVMLabels, float A, float B){ 01285 size_t i; 01286 float d, fApB; 01287 01288 for(i = 0; i < SVMLabels.size(); i++){ 01289 if (!Sleipnir::CMeta::IsNaN(d = Results.Get(SVMLabels[i]->iidx, SVMLabels[i]->jidx))){ 01290 fApB = d*A+B; 01291 // 1-p used later; avoid catastrophic cancellation 01292 if (fApB >= 0) 01293 Results.Set(SVMLabels[i]->iidx, SVMLabels[i]->jidx, exp(-fApB)/(1.0+exp(-fApB))); 01294 else 01295 Results.Set(SVMLabels[i]->iidx, SVMLabels[i]->jidx, 1.0/(1+exp(fApB))); 01296 } 01297 } 01298 } 01299 01300 STRUCTMODEL CSVMPERF::read_struct_model_w_linear(char *file, STRUCT_LEARN_PARM *sparm){ 01301 STRUCTMODEL sm; 01302 MODEL *model; 01303 01304 model = (MODEL *)my_malloc(sizeof(MODEL)); 01305 model->supvec = (DOC **)my_malloc(sizeof(DOC *)*2); 01306 model->alpha = (double *)my_malloc(sizeof(double)*2); 01307 model->index = NULL; /* index is not copied */ 01308 model->supvec[0] = NULL; 01309 model->alpha[0] = 0.0; 01310 model->alpha[1] = 1.0; 01311 model->sv_num=2; 01312 model->b = 0; 01313 model->totwords = 0; 01314 01315 // read W model 01316 static const size_t c_iBuffer = 1024; 01317 char acBuffer[c_iBuffer]; 01318 char* nameBuffer; 01319 vector<string> vecstrTokens; 01320 size_t i, extPlace; 01321 string Ext, FileName; 01322 size_t index = 0; 01323 ifstream ifsm; 01324 vector<float> SModel; 01325 01326 ifsm.open(file); 01327 while (!ifsm.eof()) { 01328 ifsm.getline(acBuffer, c_iBuffer - 1); 01329 acBuffer[c_iBuffer - 1] = 0; 01330 vecstrTokens.clear(); 01331 Sleipnir::CMeta::Tokenize(acBuffer, vecstrTokens); 01332 if (vecstrTokens.empty()) 01333 continue; 01334 if (vecstrTokens.size() > 1) { 01335 cerr << "Illegal model line (" << vecstrTokens.size() << "): " 01336 << acBuffer << endl; 01337 continue; 01338 } 01339 if (acBuffer[0] == '#') { 01340 cerr << "skipping " << acBuffer << endl; 01341 } else { 01342 SModel.push_back(atof(vecstrTokens[0].c_str())); 01343 } 01344 } 01345 01346 model->totwords = SModel.size(); 01347 model->lin_weights=(double *)my_malloc(sizeof(double)*(model->totwords+1)); 01348 model->kernel_parm.kernel_type = LINEAR; 01349 01350 for(i = 0; i < (model->totwords+1); i++){ 01351 if(i == 0) 01352 model->lin_weights[i] = 0; 01353 else 01354 model->lin_weights[i] = SModel[i-1]; 01355 } 01356 01357 model->supvec[1] = create_example(-1,0,0,0, 01358 create_svector_n(model->lin_weights, 01359 model->totwords, 01360 NULL,1.0)); 01361 01362 sm.svm_model=model; 01363 01364 sparm->loss_function=ERRORRATE; 01365 sparm->bias=0; 01366 sparm->bias_featurenum=0; 01367 sparm->num_features=sm.svm_model->totwords; 01368 sparm->truncate_fvec=(sm.svm_model->kernel_parm.kernel_type==LINEAR); 01369 01370 if(sm.svm_model->kernel_parm.kernel_type == CUSTOM) /* double kernel */ 01371 sparm->preimage_method=9; 01372 01373 sm.invL=NULL; 01374 sm.expansion=NULL; 01375 sm.expansion_size=0; 01376 sm.sparse_kernel_type=0; 01377 sm.w=sm.svm_model->lin_weights; 01378 sm.sizePsi=sm.svm_model->totwords; 01379 01380 if((sm.svm_model->kernel_parm.kernel_type!=LINEAR) && sparm->classify_dense) 01381 add_dense_vectors_to_model(sm.svm_model); 01382 return(sm); 01383 } 01384 01385 } 01386