Sleipnir
/*****************************************************************************
* This file is provided under the Creative Commons Attribution 3.0 license.
*
* You are free to share, copy, distribute, transmit, or adapt this work
* PROVIDED THAT you attribute the work to the authors listed below.
* For more information, please see the following web page:
* http://creativecommons.org/licenses/by/3.0/
*
* This file is a component of the Sleipnir library for functional genomics,
* authored by:
* Curtis Huttenhower (chuttenh@princeton.edu)
* Mark Schroeder
* Maria D. Chikina
* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
*
* If you use this library, the included executable tools, or any related
* code in your work, please cite the following publication:
* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
* Olga G. Troyanskaya.
* "The Sleipnir library for computational functional genomics"
*****************************************************************************/
#include "stdafx.h"
#include "svmstruct.h"
#include "pclset.h"
#include "dataset.h"
#include "meta.h"
#include "genome.h"
#include "compactmatrix.h"
#include <vector>
#include <set>

#define SLACK_RESCALING 1
#define MARGIN_RESCALING 2

namespace SVMArc {
extern "C" {
    void free_struct_sample(SAMPLE s);
    LABEL classify_struct_example(PATTERN x, STRUCTMODEL *sm,
            STRUCT_LEARN_PARM *sparm);
    DOC* create_example(long, long, long, double, SVECTOR *);
    SVECTOR* create_svector(WORD *, char *, double);
    void set_struct_verbosity(long verb);
}

void CSVMSTRUCTMC::SetVerbosity(size_t V) {
    struct_verbosity = (long) V;
}

bool CSVMSTRUCTMC::initialize() {

    /* struct_parm defaults */
    Alg = DEFAULT_ALG_TYPE;
    struct_parm.C = 0.01;
    struct_parm.slack_norm = 1;
    struct_parm.epsilon = DEFAULT_EPS;
    struct_parm.custom_argc = 0;
    struct_parm.loss_function = DEFAULT_LOSS_FCT;
    struct_parm.loss_type = DEFAULT_RESCALING;
    struct_parm.newconstretrain = 100;
    struct_parm.ccache_size = 5;
    struct_parm.batch_size = 100;

    /* learn_parm defaults */
    strcpy(learn_parm.alphafile, "");
    learn_parm.biased_hyperplane = 1;
    learn_parm.remove_inconsistent = 0;
    learn_parm.skip_final_opt_check = 0;
    learn_parm.svm_maxqpsize = 10;
    learn_parm.svm_newvarsinqp = 0;
    learn_parm.svm_iter_to_shrink = -9999; /* sentinel; replaced below */
    learn_parm.maxiter = 100000;
    learn_parm.kernel_cache_size = 40;
    learn_parm.svm_c = 99999999; /* overridden by struct_parm.C */
    learn_parm.eps = 0.001; /* overridden by struct_parm.epsilon */
    learn_parm.transduction_posratio = -1.0;
    learn_parm.svm_costratio = 1.0;
    learn_parm.svm_costratio_unlab = 1.0;
    learn_parm.svm_unlabbound = 1E-5;
    learn_parm.epsilon_crit = 0.001;
    learn_parm.epsilon_a = 1E-10; /* changed from 1e-15 */
    learn_parm.compute_loo = 0;
    learn_parm.rho = 1.0;
    learn_parm.xa_depth = 0;

    /* kernel_parm defaults */
    kernel_parm.kernel_type = 0;
    kernel_parm.poly_degree = 3;
    kernel_parm.rbf_gamma = 1.0;
    kernel_parm.coef_lin = 1;
    kernel_parm.coef_const = 1;
    strcpy(kernel_parm.custom, "empty");

    if (learn_parm.svm_iter_to_shrink == -9999) {
        learn_parm.svm_iter_to_shrink = 100;
    }

    if ((learn_parm.skip_final_opt_check)
            && (kernel_parm.kernel_type == LINEAR)) {
        printf("\nIt does not make sense to skip the final optimality check for linear kernels.\n\n");
        learn_parm.skip_final_opt_check = 0;
    }

    /* set number of features to -1, indicating that it will be computed
       in init_struct_model() */
    struct_parm.num_features = -1;

    return true;
}
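/*
 * Usage sketch (illustrative, not part of the library): a caller would
 * typically set verbosity, install the defaults above, and then override
 * the fields it cares about before training. This assumes the parameter
 * structs are reachable from the caller (e.g. as public members or via
 * equivalent setters); only the fields shown are initialized in this file.
 *
 * \code
 * SVMArc::CSVMSTRUCTMC svm;
 * svm.SetVerbosity(1);            // verbosity of the struct-learning code
 * svm.initialize();               // install the defaults shown above
 * svm.struct_parm.C = 1.0;        // trade-off between training error and margin
 * svm.struct_parm.epsilon = 0.01; // termination tolerance
 * \endcode
 */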
bool CSVMSTRUCTMC::parms_check() {
    if ((learn_parm.skip_final_opt_check) && (learn_parm.remove_inconsistent)) {
        fprintf(stderr,
                "\nIt is necessary to do the final optimality check when removing inconsistent\nexamples.\n");
        return false;
    }
    if (learn_parm.svm_maxqpsize < 2) {
        fprintf(stderr,
                "\nMaximum size of QP-subproblems not in valid range: %ld [2..]\n",
                learn_parm.svm_maxqpsize);
        return false;
    }
    if (learn_parm.svm_maxqpsize < learn_parm.svm_newvarsinqp) {
        fprintf(stderr,
                "\nMaximum size of QP-subproblems [%ld] must be larger than the number of\n",
                learn_parm.svm_maxqpsize);
        fprintf(stderr,
                "new variables [%ld] entering the working set in each iteration.\n",
                learn_parm.svm_newvarsinqp);
        return false;
    }
    if (learn_parm.svm_iter_to_shrink < 1) {
        fprintf(stderr,
                "\nMaximum number of iterations for shrinking not in valid range: %ld [1,..]\n",
                learn_parm.svm_iter_to_shrink);
        return false;
    }
    if (struct_parm.C < 0) {
        fprintf(stderr,
                "\nTrade-off between training error and margin is not set (C<0)!\nC value will be set to default value: Clight = Cperf * 100 / n\n");
    }
    if (learn_parm.transduction_posratio > 1) {
        fprintf(stderr,
                "\nThe fraction of unlabeled examples to classify as positives must\n");
        fprintf(stderr, "be less than 1.0 !!!\n\n");
        return false;
    }
    if (learn_parm.svm_costratio <= 0) {
        fprintf(stderr,
                "\nThe COSTRATIO parameter must be greater than zero!\n\n");
        return false;
    }
    if (struct_parm.epsilon <= 0) {
        fprintf(stderr,
                "\nThe epsilon parameter must be greater than zero!\n\n");
        return false;
    }
    if ((struct_parm.slack_norm < 1) || (struct_parm.slack_norm > 2)) {
        fprintf(stderr,
                "\nThe norm of the slacks must be either 1 (L1-norm) or 2 (L2-norm)!\n\n");
        return false;
    }
    if ((struct_parm.loss_type != SLACK_RESCALING)
            && (struct_parm.loss_type != MARGIN_RESCALING)) {
        fprintf(stderr,
                "\nThe loss type must be either 1 (slack rescaling) or 2 (margin rescaling)!\n\n");
        return false;
    }
    if (learn_parm.rho < 0) {
        fprintf(stderr,
                "\nThe parameter rho for xi/alpha-estimates and leave-one-out pruning must\n");
        fprintf(stderr,
                "be greater than zero (typically 1.0 or 2.0, see T. Joachims, Estimating the\n");
        fprintf(stderr,
                "Generalization Performance of an SVM Efficiently, ICML, 2000.)!\n\n");
        return false;
    }
    if ((learn_parm.xa_depth < 0) || (learn_parm.xa_depth > 100)) {
        fprintf(stderr,
                "\nThe parameter depth for ext. xi/alpha-estimates must be in [0..100] (zero\n");
        fprintf(stderr,
                "for switching to the conventional xa/estimates described in T. Joachims,\n");
        fprintf(stderr,
                "Estimating the Generalization Performance of an SVM Efficiently, ICML, 2000.)\n");
        return false;
    }

    return true;
}
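/*
 * Sketch (illustrative): parms_check() is a pure validator, so a caller
 * would gate training on its return value after adjusting parameters. The
 * loss-type values are the SLACK_RESCALING / MARGIN_RESCALING macros
 * defined at the top of this file; the surrounding calling code is assumed.
 *
 * \code
 * svm.struct_parm.loss_type = MARGIN_RESCALING; // or SLACK_RESCALING
 * if (!svm.parms_check()) {
 *     cerr << "Invalid SVM-struct parameters, aborting." << endl;
 *     return 1;
 * }
 * \endcode
 */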
DOC* CSVMSTRUCTMC::CreateDoc(Sleipnir::CPCL &PCL, size_t iGene, size_t iDoc) {
    WORD* aWords;
    size_t i, iWords;
    float d;
    DOC* pRet;

    /* one feature per experiment in the PCL */
    iWords = PCL.GetExperiments();
    aWords = new WORD[iWords + 1];

    /* fill the sparse vector; svm_light feature numbers are one-based,
       and missing values are mapped to zero */
    for (i = 0; i < iWords; ++i) {
        aWords[i].wnum = i + 1;
        if (!Sleipnir::CMeta::IsNaN(d = PCL.Get(iGene, i)))
            aWords[i].weight = d;
        else
            aWords[i].weight = 0;
    }
    aWords[i].wnum = 0; /* a zero wnum terminates the word array */
    pRet = create_example(iDoc, 0, 0, 1, create_svector(aWords, "", 1));
    delete[] aWords;
    return pRet;
}
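/*
 * Sketch (illustrative): the WORD array handed to create_svector() above is
 * a dense encoding of one gene's expression profile in svm_light's sparse
 * format. For a hypothetical three-experiment profile (0.5, NaN, -1.2) the
 * array built by CreateDoc() would look like this:
 *
 * \code
 * WORD aWords[4];
 * aWords[0].wnum = 1; aWords[0].weight = 0.5f;
 * aWords[1].wnum = 2; aWords[1].weight = 0.0f;  // NaN mapped to zero
 * aWords[2].wnum = 3; aWords[2].weight = -1.2f;
 * aWords[3].wnum = 0;                           // terminator
 * \endcode
 */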
vector<SVMLabel> CSVMSTRUCTMC::ReadLabels(ifstream & ifsm) {

    static const size_t c_iBuffer = 1024;
    char acBuffer[c_iBuffer];
    vector<string> vecstrTokens;
    vector<SVMLabel> vecLabels;
    size_t numPositives, numNegatives;
    numPositives = numNegatives = 0;
    while (ifsm.getline(acBuffer, c_iBuffer - 1)) {
        acBuffer[c_iBuffer - 1] = 0;
        vecstrTokens.clear();
        CMeta::Tokenize(acBuffer, vecstrTokens);
        if (vecstrTokens.empty())
            continue;
        if (vecstrTokens.size() != 2) {
            cerr << "Illegal label line (" << vecstrTokens.size() << "): "
                    << acBuffer << endl;
            continue;
        }
        vecLabels.push_back(SVMArc::SVMLabel(vecstrTokens[0],
                atoi(vecstrTokens[1].c_str())));
        if (vecLabels.back().Target > 0)
            numPositives++;
        else
            numNegatives++;
    }
    return vecLabels;
}


SAMPLE* CSVMSTRUCTMC::CreateSample(Sleipnir::CPCL &PCL, vector<SVMLabel> SVMLabels) {
    size_t i, iGene, iDoc;
    int n; /* number of examples */
    int *target;
    long num_classes = 0;
    SAMPLE* pSample = new SAMPLE;
    EXAMPLE* examples;
    DOC** docs;
    vector<DOC*> vec_pDoc;
    vec_pDoc.reserve(SVMLabels.size());
    vector<int> vecClass;
    vecClass.reserve(SVMLabels.size());

    iDoc = 0;
    for (i = 0; i < SVMLabels.size(); i++) {
        if (!SVMLabels[i].hasIndex) {
            SVMLabels[i].SetIndex(PCL.GetGene(SVMLabels[i].GeneName));
        }
        iGene = SVMLabels[i].index;
        /* skip genes that are not present in the PCL */
        if (iGene != (size_t) -1) {
            iDoc++;
            vec_pDoc.push_back(CreateDoc(PCL, iGene, iDoc - 1));
            vecClass.push_back(SVMLabels[i].Target);
        }
    }

    /* copy patterns and labels into plain arrays */
    docs = new DOC*[vec_pDoc.size()];
    n = vec_pDoc.size();
    copy(vec_pDoc.begin(), vec_pDoc.end(), docs);
    vec_pDoc.clear();

    target = new int[vecClass.size()];
    copy(vecClass.begin(), vecClass.end(), target);
    vecClass.clear();

    examples = (EXAMPLE *) my_malloc(sizeof(EXAMPLE) * n);
    for (i = 0; i < n; i++) /* find highest class label */
        if (num_classes < target[i])
            num_classes = target[i];

    for (i = 0; i < n; i++) /* make sure all class labels are positive */
        if (target[i] < 1) {
            printf("\nERROR: The class label '%d' of example number %ld is not greater than '0'!\n",
                    target[i], (long) (i + 1));
            exit(1);
        }
    for (i = 0; i < n; i++) { /* copy docs over into new datastructure */
        examples[i].x.doc = docs[i];
        examples[i].y.Class = target[i];
        examples[i].y.scores = NULL;
        examples[i].y.num_classes = num_classes;
    }
    delete[] target; /* allocated with new[], so delete[] rather than free() */
    delete[] docs;
    pSample->n = n;
    pSample->examples = examples;

    if (struct_verbosity >= 0)
        printf(" (%d examples) ", pSample->n);

    return pSample;
}
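/*
 * Sketch (illustrative): ReadLabels() expects one gene per line with two
 * whitespace-separated tokens, a gene name and an integer class label
 * (labels must be >= 1, per the check in CreateSample() above). The file
 * name and PCL object below are assumed context, not part of this file.
 *
 * \code
 * // labels.txt (hypothetical):
 * //   YAL001C  1
 * //   YAL002W  3
 * //   YAL003W  2
 * ifstream ifsm("labels.txt");
 * vector<SVMArc::SVMLabel> vecLabels = svm.ReadLabels(ifsm);
 * SAMPLE* pSample = svm.CreateSample(PCL, vecLabels); // PCL: a loaded Sleipnir::CPCL
 * \endcode
 */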
//Single gene classification

vector<Result> CSVMSTRUCTMC::Classify(Sleipnir::CPCL &PCL,
        vector<SVMLabel> SVMLabels) {
    size_t i, k, iGene, iDoc;
    vector<Result> vecResult;
    iDoc = 0;
    PATTERN pattern;
    pattern.totdoc = 1;
    cerr << "CLASSIFY classifying " << endl;
    LABEL label;
    for (i = 0; i < SVMLabels.size(); i++) {
        if (!SVMLabels[i].hasIndex) {
            SVMLabels[i].SetIndex(PCL.GetGene(SVMLabels[i].GeneName));
        }
        iGene = SVMLabels[i].index;
        if (iGene != (size_t) -1) {
            iDoc++;
            pattern.doc = CreateDoc(PCL, iGene, iDoc);
            label = classify_struct_example(pattern, &structmodel,
                    &struct_parm);
            vecResult.resize(iDoc);
            vecResult[iDoc - 1].GeneName = SVMLabels[i].GeneName;
            vecResult[iDoc - 1].Target = SVMLabels[i].Target;
            vecResult[iDoc - 1].Value = label.Class;
            vecResult[iDoc - 1].num_class = struct_parm.num_classes;
            /* per-class scores are one-based, indexed by class label */
            for (k = 1; k <= struct_parm.num_classes; k++)
                vecResult[iDoc - 1].Scores.push_back(label.scores[k]);
            free_label(label); /* release the scores array allocated by classify_struct_example */
            FreeDoc(pattern.doc);
        }
    }

    return vecResult;
}


void CSVMSTRUCTMC::FreeSample_leave_Doc(SAMPLE s) {
    /* Frees the memory of sample s, but leaves the feature vectors inside
       each DOC untouched (hence "leave_Doc"). */
    int i;
    for (i = 0; i < s.n; i++) {
        free(s.examples[i].x.doc);
        free_label(s.examples[i].y);
    }
    free(s.examples);
}

}
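/*
 * End-to-end sketch (illustrative): the pieces in this file compose as
 * follows. The training step itself is not defined in this file; the
 * "Learn"-style member named below is hypothetical, as are PCL,
 * vecLabels, and vecTestLabels. FreeSample_leave_Doc() is the matching
 * cleanup for a SAMPLE built by CreateSample().
 *
 * \code
 * SVMArc::CSVMSTRUCTMC svm;
 * svm.initialize();
 * if (!svm.parms_check())
 *     return 1;
 * SAMPLE* pSample = svm.CreateSample(PCL, vecLabels);  // training set
 * // ... train here (e.g., a Learn()-style member; hypothetical) ...
 * vector<SVMArc::Result> vecResults = svm.Classify(PCL, vecTestLabels);
 * svm.FreeSample_leave_Doc(*pSample);  // frees DOC shells and labels
 * delete pSample;
 * \endcode
 */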