Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "svmstructtree.h" 00024 #include "pclset.h" 00025 #include "dataset.h" 00026 #include "meta.h" 00027 #include "genome.h" 00028 #include "compactmatrix.h" 00029 #include <vector> 00030 #include <set> 00031 00032 #define SLACK_RESCALING 1 00033 #define MARGIN_RESCALING 2 00034 00035 namespace SVMArc { 00036 //extern "C" { 00037 // // void free_struct_model(STRUCTMODEL sm); 00038 // void free_struct_sample(SAMPLE s); 00039 // // void svm_learn_struct_joint_custom(SAMPLE sample, 00040 // // STRUCT_LEARN_PARM *sparm, 00041 // // LEARN_PARM *lparm, KERNEL_PARM *kparm, 00042 // // STRUCTMODEL *sm); 00043 // // SAMPLE read_struct_examples_sleipnir(DOC **all_docs, double*all_labels, int example_size, int total_features, STRUCT_LEARN_PARM *sparm); 00044 // // void free_struct_model(STRUCTMODEL sm); 00045 // // void free_struct_sample(SAMPLE s); 00046 // // void set_struct_verbosity(long verb); 00047 // // double estimate_r_delta_average(DOC **, long, KERNEL_PARM *); 00048 // // MODEL *read_model(char *); 00049 // LABEL classify_struct_example(PATTERN x, STRUCTMODEL *sm, 00050 // STRUCT_LEARN_PARM *sparm); 00051 // DOC* create_example(long, long, long, double, SVECTOR *); 00052 // SVECTOR * create_svector(WORD *, char *, double); 00053 // void set_struct_verbosity(long verb); 00054 00055 //} 00056 00057 void CSVMSTRUCTTREE::SetVerbosity(size_t V) { 00058 struct_verbosity = (long) V; 00059 //if( struct_verbosity>1) 00060 // struct_verbosity=1; 00061 } 00062 00063 bool CSVMSTRUCTTREE::initialize() { 00064 00065 //set directionality 00066 00067 00068 /* set default */ 00069 Alg = DEFAULT_ALG_TYPE; 00070 //Learn_parms 00071 struct_parm.C=-0.01; 00072 struct_parm.slack_norm=1; 00073 struct_parm.epsilon=DEFAULT_EPS; 00074 struct_parm.custom_argc=0; 00075 struct_parm.loss_function=DEFAULT_LOSS_FCT; 00076 struct_parm.loss_type=DEFAULT_RESCALING; 00077 struct_parm.newconstretrain=100; 00078 struct_parm.ccache_size=5; 00079 struct_parm.batch_size=100; 00080 //Learn_parms 00081 //strcpy (learn_parm.predfile, "trans_predictions"); 00082 strcpy(learn_parm.alphafile, ""); 00083 //verbosity=0;/*verbosity for svm_light*/ 00084 //struct_verbosity = 1; /*verbosity for struct learning portion*/ 00085 learn_parm.biased_hyperplane=1; 00086 learn_parm.remove_inconsistent=0; 00087 learn_parm.skip_final_opt_check=0; 00088 learn_parm.svm_maxqpsize=10; 00089 learn_parm.svm_newvarsinqp=0; 00090 learn_parm.svm_iter_to_shrink=-9999; 00091 learn_parm.maxiter=100000; 00092 learn_parm.kernel_cache_size=40; 00093 learn_parm.svm_c=99999999; /* overridden by struct_parm.C */ 00094 learn_parm.eps=0.001; /* overridden by struct_parm.epsilon */ 00095 learn_parm.transduction_posratio=-1.0; 00096 learn_parm.svm_costratio=1.0; 00097 learn_parm.svm_costratio_unlab=1.0; 00098 learn_parm.svm_unlabbound=1E-5; 00099 learn_parm.epsilon_crit=0.001; 00100 learn_parm.epsilon_a=1E-10; /* changed from 1e-15 */ 00101 learn_parm.compute_loo=0; 00102 learn_parm.rho=1.0; 00103 learn_parm.xa_depth=0; 00104 kernel_parm.kernel_type=0; 00105 kernel_parm.poly_degree=3; 00106 kernel_parm.rbf_gamma=1.0; 00107 kernel_parm.coef_lin=1; 00108 kernel_parm.coef_const=1; 00109 strcpy(kernel_parm.custom, "empty"); 00110 00111 if (learn_parm.svm_iter_to_shrink == -9999) { 00112 learn_parm.svm_iter_to_shrink = 100; 00113 } 00114 00115 if ((learn_parm.skip_final_opt_check) 00116 && (kernel_parm.kernel_type == LINEAR)) { 00117 printf( 00118 "\nIt does not make sense to skip the final optimality check for linear kernels.\n\n"); 00119 learn_parm.skip_final_opt_check = 0; 00120 } 00121 00122 //struct parms 00123 00124 /* set number of features to -1, indicating that it will be computed 00125 in init_struct_model() */ 00126 struct_parm.num_features = -1; 00127 00128 return true; 00129 } 00130 00131 bool CSVMSTRUCTTREE::parms_check() { 00132 if ((learn_parm.skip_final_opt_check) && (learn_parm.remove_inconsistent)) { 00133 fprintf( 00134 stderr, 00135 "\nIt is necessary to do the final optimality check when removing inconsistent \nexamples.\n"); 00136 return false; 00137 } 00138 if ((learn_parm.svm_maxqpsize < 2)) { 00139 fprintf( 00140 stderr, 00141 "\nMaximum size of QP-subproblems not in valid range: %ld [2..]\n", 00142 learn_parm.svm_maxqpsize); 00143 return false; 00144 } 00145 if ((learn_parm.svm_maxqpsize < learn_parm.svm_newvarsinqp)) { 00146 fprintf( 00147 stderr, 00148 "\nMaximum size of QP-subproblems [%ld] must be larger than the number of\n", 00149 learn_parm.svm_maxqpsize); 00150 fprintf( 00151 stderr, 00152 "new variables [%ld] entering the working set in each iteration.\n", 00153 learn_parm.svm_newvarsinqp); 00154 return false; 00155 } 00156 if (learn_parm.svm_iter_to_shrink < 1) { 00157 fprintf( 00158 stderr, 00159 "\nMaximum number of iterations for shrinking not in valid range: %ld [1,..]\n", 00160 learn_parm.svm_iter_to_shrink); 00161 return false; 00162 } 00163 if (struct_parm.C < 0) { 00164 fprintf( 00165 stderr, 00166 "\nTrade-off between training error and margin is not set (C<0)!\nC value will be set to default value. Clight = Cpef * 100 / n \n"); 00167 } 00168 if (learn_parm.transduction_posratio > 1) { 00169 fprintf(stderr, 00170 "\nThe fraction of unlabeled examples to classify as positives must\n"); 00171 fprintf(stderr, "be less than 1.0 !!!\n\n"); 00172 return false; 00173 } 00174 if (learn_parm.svm_costratio <= 0) { 00175 fprintf(stderr, 00176 "\nThe COSTRATIO parameter must be greater than zero!\n\n"); 00177 return false; 00178 } 00179 if (struct_parm.epsilon <= 0) { 00180 fprintf(stderr, 00181 "\nThe epsilon parameter must be greater than zero!\n\n"); 00182 return false; 00183 } 00184 if ((struct_parm.slack_norm < 1) || (struct_parm.slack_norm > 2)) { 00185 fprintf(stderr, 00186 "\nThe norm of the slacks must be either 1 (L1-norm) or 2 (L2-norm)!\n\n"); 00187 return false; 00188 } 00189 00190 if (struct_parm.loss_type 00191 != MARGIN_RESCALING) { 00192 fprintf( 00193 stderr, 00194 "\nThe loss type must be margin rescaling!\n\n"); 00195 return false; 00196 } 00197 if (struct_parm.num_classes<2){ 00198 fprintf( 00199 stderr, 00200 "\nAt least two classes in label are required!\n\n"); 00201 return false; 00202 } 00203 //if (struct_parm.num_features<1){ 00204 // fprintf( 00205 // stderr, 00206 // "\nAt least one feature is required!\n\n"); 00207 // return false; 00208 //} 00209 if (learn_parm.rho < 0) { 00210 fprintf(stderr, 00211 "\nThe parameter rho for xi/alpha-estimates and leave-one-out pruning must\n"); 00212 fprintf(stderr, 00213 "be greater than zero (typically 1.0 or 2.0, see T. Joachims, Estimating the\n"); 00214 fprintf(stderr, 00215 "Generalization Performance of an SVM Efficiently, ICML, 2000.)!\n\n"); 00216 return false; 00217 } 00218 if ((learn_parm.xa_depth < 0) || (learn_parm.xa_depth > 100)) { 00219 fprintf(stderr, 00220 "\nThe parameter depth for ext. xi/alpha-estimates must be in [0..100] (zero\n"); 00221 fprintf(stderr, 00222 "for switching to the conventional xa/estimates described in T. Joachims,\n"); 00223 fprintf( 00224 stderr, 00225 "Estimating the Generalization Performance of an SVM Efficiently, ICML, 2000.)\n"); 00226 return false; 00227 } 00228 00229 00230 00231 return true; 00232 } 00233 00234 void CSVMSTRUCTTREE::ReadOntology(const char* treefile) { 00235 vector<ONTONODE*> nodes; 00236 vector<TEMPNODE> tempnodes; 00237 ONTONODE* newnode; 00238 TEMPNODE newtempnode; 00239 int currentindex=0; 00240 ifstream ifsm; 00241 ifsm.clear(); 00242 ifsm.open(treefile); 00243 if (!ifsm.is_open()){ 00244 cerr << "Could not read Onto file" << endl; 00245 exit(1); 00246 } 00247 static const size_t c_iBuffer = 1024; //change this if not enough 00248 char acBuffer[c_iBuffer]; 00249 vector<string> vecstrTokens; 00250 map<string,int>::iterator it; 00251 00252 00253 while (!ifsm.eof()) { 00254 00255 /*read in text file */ 00256 ifsm.getline(acBuffer, c_iBuffer - 1); 00257 acBuffer[c_iBuffer - 1] = 0; 00258 vecstrTokens.clear(); 00259 CMeta::Tokenize(acBuffer, vecstrTokens); 00260 00261 if (vecstrTokens.empty()) 00262 continue; 00263 if (vecstrTokens.size() < 2) { 00264 cerr << "Illegal line (" << vecstrTokens.size() << "): " 00265 << acBuffer << endl; 00266 continue; 00267 } 00268 00269 //construct tree; correctness check of file is not writen yet; 00270 //construct string to index mapping onto_map 00271 it= onto_map.find(vecstrTokens[0]); 00272 if(it == onto_map.end()){ 00273 currentindex = onto_map.size(); 00274 onto_map[vecstrTokens[0]]=currentindex; 00275 onto_map_rev[currentindex]=vecstrTokens[0]; 00276 newnode= (ONTONODE *)my_malloc(sizeof(ONTONODE)); 00277 cerr << "Read new Onto Term: "<< vecstrTokens[0]<<endl; 00278 nodes.push_back(newnode); 00279 tempnodes.push_back(newtempnode); 00280 //shall I add node name to tree structure? 00281 } 00282 for (int i=1; i < vecstrTokens.size();i++){ 00283 it= onto_map.find(vecstrTokens[i]); 00284 if(it == onto_map.end()) { 00285 currentindex = onto_map.size(); 00286 onto_map[vecstrTokens[i]]=currentindex; 00287 cerr << "Read new Onto Term: "<< vecstrTokens[i]<<endl; 00288 onto_map_rev[currentindex]=vecstrTokens[i]; 00289 newnode= (ONTONODE *)my_malloc(sizeof(ONTONODE)); 00290 nodes.push_back(newnode); 00291 tempnodes.push_back(newtempnode); 00292 } 00293 00294 nodes[onto_map[vecstrTokens[i]]]->parent = nodes[onto_map[vecstrTokens[0]]]; 00295 tempnodes[onto_map[vecstrTokens[0]]].children.insert( nodes[onto_map[vecstrTokens[i]]]); 00296 00297 } 00298 00299 00300 } 00301 ONTONODE** newchildren; 00302 00303 for(int i=0; i < nodes.size(); i++){ 00304 nodes[i]->n_children=tempnodes[i].children.size(); 00305 //copy children 00306 newchildren = (ONTONODE **)my_malloc(sizeof(ONTONODE*)*nodes[i]->n_children); 00307 copy(tempnodes[i].children.begin(),tempnodes[i].children.end(),newchildren); 00308 nodes[i]->children = newchildren; 00309 //fill in ontology struct parameters 00310 nodes[i]->index=i; //index 00311 nodes[i]->inputlabelCount = 0; 00312 if(nodes[i]->n_children==0) //isLeafnode 00313 nodes[i]->isLeafnode=1; 00314 else 00315 nodes[i]->isLeafnode=0; 00316 nodes[i]->weight = 1; 00317 } 00318 00319 //copy all nodes to a C type array 00320 ONTONODE** allnewnodes = (ONTONODE **)my_malloc(sizeof(ONTONODE*)*nodes.size()); 00321 copy(nodes.begin(),nodes.end(),allnewnodes); 00322 00323 /*pass the tree to struct_parm*/ 00324 struct_parm.treeStruct.nodes=allnewnodes; 00325 struct_parm.treeStruct.n_nodes=nodes.size(); 00326 struct_parm.num_classes = nodes.size(); //num_classes 00327 00328 //free 00329 nodes.clear(); 00330 tempnodes.clear(); 00331 } 00332 00333 DOC* CSVMSTRUCTTREE::CreateDoc(Sleipnir::CPCL &PCL, size_t iGene, size_t iDoc) { 00334 WORD* aWords; 00335 size_t i, j, iWord, iWords, iPCL, iExp; 00336 float d; 00337 DOC* pRet; 00338 pRet->fvec->words[0].weight; 00339 //get number of features 00340 iWords = PCL.GetExperiments(); 00341 //cerr<<"Newing WORDS "<<(iWords+1)*sizeof(WORD)<<endl; 00342 aWords = new WORD[iWords + 2]; 00343 //set the words 00344 for (i = 0; i < iWords; ++i) { 00345 aWords[i].wnum = i + 1; 00346 if (!Sleipnir::CMeta::IsNaN(d = PCL.Get(iGene, i))) 00347 aWords[i].weight = d; 00348 else 00349 aWords[i].weight = 0; 00350 } 00351 aWords[i].wnum=iWords+1; //add a constant feature anyway 00352 aWords[i].weight=1; 00353 aWords[i+1].wnum = 0; 00354 // cerr<<"START Create Example"<<endl; 00355 pRet = create_example(iDoc, 0, 0, 1, create_svector(aWords, "", 1)); 00356 //cerr<<"END create example"<<endl; 00357 delete[] aWords; 00358 return pRet; 00359 } 00360 00361 00362 //DOC* CSVMSTRUCTTREE::CreateDoc(Sleipnir::CDat& Dat, size_t iGene, size_t iDoc) { 00363 // WORD* aWords; 00364 // size_t i, j, iWord, iWords; 00365 // float d; 00366 // DOC* pRet; 00367 // pRet->fvec->words[0].weight; 00368 // //get number of features 00369 // iWords = Dat.GetGenes(); 00370 // // cout << "CD:iwords=" << iWords << endl; 00371 // aWords = new WORD[iWords + 1]; 00372 // //number the words 00373 // for (i = 0; i < iWords; ++i) { 00374 // // cout<<i<<endl; 00375 // aWords[i].wnum = i + 1; 00376 // // asWords[ i ].wnum = 0; 00377 // } 00378 // aWords[i].wnum = 0; 00379 // //get the values; 00380 // iWord = 0; 00381 // for (i = 0; i < Dat.GetGenes(); i++) { 00382 // if (!Sleipnir::CMeta::IsNaN(d = Dat.Get(iGene, i))) { 00383 // // if (i==0 && j==0) 00384 // // cout<<"First value is "<<d<<endl; 00385 // aWords[iWord].weight = d; 00386 // } else 00387 // aWords[iWord].weight = 0; 00388 // iWord++; 00389 // } 00390 // pRet = create_example(iDoc, 0, 0, 1, create_svector(aWords, "", 1)); 00391 // delete[] aWords; 00392 // // cout<<"done creating DOC"<<endl; 00393 // return pRet; 00394 //} 00395 00396 00397 vector<SVMLabel> CSVMSTRUCTTREE::ReadLabels(ifstream & ifsm) { 00398 static const size_t c_iBuffer = 65532; 00399 char acBuffer[c_iBuffer]; 00400 vector<string> vecstrTokens; 00401 vector<char> multilabels; 00402 vector<SVMLabel> vecLabels; 00403 ONTONODE *pnode; 00404 00405 if(struct_parm.num_classes==0) 00406 cerr<< "Ontology must be read before reading labels!"<<endl; 00407 else 00408 cerr<<struct_parm.num_classes<< " Classes Read!"<<endl; 00409 multilabels.resize(struct_parm.num_classes); 00410 map<string,int>::iterator it; 00411 while (!ifsm.eof()) { 00412 ifsm.getline(acBuffer, c_iBuffer - 1); 00413 acBuffer[c_iBuffer - 1] = 0; 00414 vecstrTokens.clear(); 00415 CMeta::Tokenize(acBuffer, vecstrTokens); 00416 if (vecstrTokens.empty()) 00417 continue; 00418 if (vecstrTokens.size() < 2) { 00419 cerr << "Illegal label line (" << vecstrTokens.size() << "): " 00420 << acBuffer << endl; 00421 continue; 00422 } 00423 00424 for (int i=1; i<multilabels.size();i++) 00425 multilabels[i]=0; 00426 multilabels[0]=1; //root node is always on 00427 for(int i=1; i < vecstrTokens.size();i++){ 00428 it = onto_map.find(vecstrTokens[i]); 00429 if(it == onto_map.end()){ 00430 if(struct_verbosity>=2) 00431 cerr<< "Unknown term: "<<vecstrTokens[i]<<endl; 00432 } 00433 else{ 00434 multilabels[onto_map[vecstrTokens[i]]]=1; 00435 struct_parm.treeStruct.nodes[ onto_map[vecstrTokens[i]] ]->inputlabelCount++; 00436 if(struct_verbosity>=3) 00437 cout<<vecstrTokens[0]<<'\t'<<vecstrTokens[i]; 00438 //label propagation; add print propagation process 00439 pnode=struct_parm.treeStruct.nodes[onto_map[vecstrTokens[i]]]->parent; 00440 while(pnode && multilabels[pnode->index]!=1){ 00441 multilabels[pnode->index]=1; 00442 struct_parm.treeStruct.nodes[pnode->index]->inputlabelCount++; 00443 if(struct_verbosity>=3) 00444 cout<<'\t'<<onto_map_rev[pnode->index]; 00445 pnode = struct_parm.treeStruct.nodes[pnode->index]->parent; 00446 } 00447 if(struct_verbosity>=3) 00448 cout<<endl; 00449 //end label propagation 00450 00451 } 00452 } 00453 preprocessLabel(&multilabels); 00454 vecLabels.push_back(SVMArc::SVMLabel(vecstrTokens[0], multilabels)); 00455 } 00456 return vecLabels; 00457 } 00458 00459 void CSVMSTRUCTTREE::preprocessLabel(vector<char>* multilabels){ 00460 int i,iclass,flag_childrenannotated; 00461 for ( iclass=0; iclass < multilabels->size();iclass++){ 00462 if((*multilabels)[iclass]==1){ 00463 flag_childrenannotated = 0; 00464 for( i=0; i<struct_parm.treeStruct.nodes[iclass]->n_children; i++){ 00465 if((*multilabels)[struct_parm.treeStruct.nodes[iclass]->children[i]->index]==1){ 00466 flag_childrenannotated=1; 00467 break; 00468 } 00469 } 00470 if(flag_childrenannotated==0){ 00471 vecsetZero(struct_parm.treeStruct.nodes[iclass],multilabels,2); 00472 (*multilabels)[iclass]=1; 00473 } 00474 } 00475 } 00476 00477 00478 } 00479 00480 void CSVMSTRUCTTREE::vecsetZero (ONTONODE* node, vector<char>* ybar0,char zero) { 00481 //printf("setZero\n"); 00482 00483 int i; 00484 if((*ybar0)[node->index]!=zero){ 00485 (*ybar0)[node->index] = zero; 00486 for(i=0; i < node->n_children; i++) 00487 if((*ybar0)[node->children[i]->index]!=zero) 00488 vecsetZero(node->children[i], ybar0,zero); 00489 } 00490 } 00491 00492 void CSVMSTRUCTTREE::InitializeLikAfterReadLabels() { 00493 struct_parm.condLikelihood = (double*)my_malloc(sizeof(double)*struct_parm.num_classes); 00494 struct_parm.condLikelihood[0] = 0; // now the first term in ontofile has to be the 'head node', change this to make code more robust 00495 for(int i=1; i<struct_parm.num_classes;i++){ 00496 if(struct_parm.treeStruct.nodes[i]->inputlabelCount>0){ 00497 struct_parm.treeStruct.nodes[i]->posBalanceWeight = (struct_parm.treeStruct.nodes[0]->inputlabelCount/2)/ struct_parm.treeStruct.nodes[i]->inputlabelCount; 00498 struct_parm.treeStruct.nodes[i]->negBalanceWeight = (struct_parm.treeStruct.nodes[0]->inputlabelCount/2)/ (struct_parm.treeStruct.nodes[0]->inputlabelCount-struct_parm.treeStruct.nodes[i]->inputlabelCount); 00499 }else{ 00500 struct_parm.treeStruct.nodes[i]->posBalanceWeight = 0; 00501 struct_parm.treeStruct.nodes[i]->negBalanceWeight = 0; 00502 } 00503 struct_parm.condLikelihood[i] = log(struct_parm.treeStruct.nodes[i]->parent->inputlabelCount + 1) 00504 - log(struct_parm.treeStruct.nodes[i]->inputlabelCount + 1); 00505 } 00506 } 00507 SAMPLE* CSVMSTRUCTTREE::CreateSample(Sleipnir::CPCL &PCL, vector<SVMLabel> SVMLabels) { 00508 size_t i, j, iGene, iDoc; 00509 int n; /* number of examples */ 00510 vector<char*> target; 00511 char* newmultilabel; 00512 long num_classes=0; 00513 SAMPLE* pSample = new SAMPLE; 00514 EXAMPLE* examples; 00515 DOC** docs; 00516 vector<DOC*> vec_pDoc; 00517 vec_pDoc.reserve(SVMLabels.size()); 00518 vector< vector<char> > vecClass; 00519 vecClass.reserve(SVMLabels.size()); 00520 iDoc = 0; 00521 00522 for (i = 0; i < SVMLabels.size(); i++) { 00523 // cout<< "processing gene " << SVMLabels[i].GeneName << endl; 00524 if (!SVMLabels[i].hasIndex) { 00525 SVMLabels[i].SetIndex(PCL.GetGene(SVMLabels[i].GeneName)); 00526 } 00527 iGene = SVMLabels[i].index; 00528 // cout << SVMLabels[i].GeneName<<" gene at location "<<iGene << endl; 00529 if (iGene != -1) { 00530 // cout << "creating doc" << endl; 00531 iDoc++; 00532 vec_pDoc.push_back(CreateDoc(PCL, iGene, iDoc - 1)); 00533 vecClass.push_back(SVMLabels[i].TargetM); 00534 } 00535 } 00536 00537 00538 //copy patterns and labels to new vector 00539 docs = new DOC*[vec_pDoc.size()]; 00540 n = vec_pDoc.size(); 00541 //cout << "Read in " << n << "Standards"<<endl; 00542 copy(vec_pDoc.begin(), vec_pDoc.end(), docs); 00543 vec_pDoc.clear(); 00544 00545 //cerr << "NEW Class array" << endl; 00546 target.resize(vecClass.size()); 00547 for (i= 0; i<vecClass.size();i++) 00548 { 00549 newmultilabel = (char*)my_malloc(sizeof(char)*vecClass[i].size()); 00550 copy(vecClass[i].begin(),vecClass[i].end(),newmultilabel); 00551 target[i] = newmultilabel; 00552 } 00553 vecClass.clear(); 00554 00555 examples=(EXAMPLE *)my_malloc(sizeof(EXAMPLE)*n); 00556 00557 for(i=0; i<n; i++) { /* copy docs over into new datastructure */ 00558 examples[i].x.doc=docs[i]; 00559 examples[i].y.Class=target[i]; 00560 //examples[i].y.scores=NULL; 00561 examples[i].y.num_classes= struct_parm.num_classes; 00562 } 00563 target.clear(); 00564 delete(docs); 00565 pSample->n=n; 00566 pSample->examples=examples; 00567 00568 if(struct_verbosity>=0) 00569 printf(" (%d examples) ",pSample->n); 00570 00571 return pSample; 00572 //cerr<<"DONE CreateSample"<<endl; 00573 } 00574 00575 // 00576 // SAMPLE* CSVMSTRUCTTREE::CreateSample(Sleipnir::CDat& Dat, vector<SVMLabel> SVMLabels) { 00577 // size_t i, j, iGene, iDoc; 00578 // vector<DOC*> vec_pDoc; 00579 // vector<double> vecClass; 00580 // vector<size_t> veciGene; 00581 // iDoc = 0; 00582 // float numPositives, numNegatives; 00583 // numPositives = numNegatives = 0; 00584 // for (i = 0; i < SVMLabels.size(); i++) { 00585 // // cout<< "processing gene " << SVMLabels[i].GeneName << endl; 00586 // iGene = Dat.GetGene(SVMLabels[i].GeneName); 00587 // // cout << SVMLabels[i].GeneName<<" gene at location "<<iGene << endl; 00588 // if (iGene != -1) { 00589 // // cout << "creating doc" << endl; 00590 // iDoc++; 00591 // vec_pDoc.push_back(CreateDoc(Dat, iGene, iDoc - 1)); 00592 // vecClass.push_back(SVMLabels[i].Target); 00593 // } 00594 // } 00595 // 00596 // DOC** ppDoc; 00597 // ppDoc = new DOC*[vec_pDoc.size()]; 00598 // copy(vec_pDoc.begin(), vec_pDoc.end(), ppDoc); 00599 // vec_pDoc.clear(); 00600 // PATTERN* pPattern = new PATTERN; 00601 // pPattern->doc = ppDoc; 00602 // 00603 // pPattern->totdoc = iDoc; 00604 // // cout << "number of document=" << pPattern->totdoc << endl; 00605 // LABEL* pLabel = new LABEL; 00606 // double* aClass; 00607 // aClass = new double[vecClass.size()]; 00608 // copy(vecClass.begin(), vecClass.end(), aClass); 00609 // vecClass.clear(); 00610 // pLabel->Class = aClass; 00611 // pLabel->totdoc = iDoc; 00612 // 00613 // EXAMPLE* aExample; 00614 // aExample = new EXAMPLE[1]; 00615 // //cout<<"aExample @"<<aExample<<endl; 00616 // aExample[0].x = *pPattern; 00617 // aExample[0].y = *pLabel; 00618 // SAMPLE* pSample = new SAMPLE; 00619 // pSample->n = 1; 00620 // pSample->examples = aExample; 00621 // /* cout << "examples @" << pSample->examples << endl; 00622 // cout<< "ppDoc="<<ppDoc<<endl; 00623 // cout << "docs @" << pSample->examples[0].x.doc << endl; 00624 // cout<<"done creating sample"<<endl; 00625 // cout<<"sample @ "<<pSample<<endl;*/ 00626 // return pSample; 00627 //} 00628 00629 //Single gene classification 00630 00631 vector<Result> CSVMSTRUCTTREE::Classify(Sleipnir::CPCL &PCL, 00632 vector<SVMLabel> SVMLabels) { 00633 size_t i, j,k, iGene, iDoc; 00634 vector<int> vecClass; 00635 vector<Result> vecResult; 00636 iDoc = 0; 00637 PATTERN pattern; 00638 pattern.totdoc = 1; 00639 cerr << "CLASSIFY classifying " << endl; 00640 LABEL label; 00641 for (i = 0; i < SVMLabels.size(); i++) { 00642 if (!SVMLabels[i].hasIndex) { 00643 SVMLabels[i].SetIndex(PCL.GetGene(SVMLabels[i].GeneName)); 00644 } 00645 iGene = SVMLabels[i].index; 00646 if (iGene != -1) { 00647 iDoc++; 00648 00649 pattern.doc = CreateDoc(PCL, iGene, iDoc); 00650 label = classify_struct_example(pattern, &structmodel, 00651 &struct_parm); 00652 vecClass.push_back(SVMLabels[i].Target); 00653 vecResult.resize(iDoc); 00654 vecResult[iDoc - 1].GeneName = SVMLabels[i].GeneName; 00655 vecResult[iDoc - 1].TargetM = SVMLabels[i].TargetM; 00656 vecResult[iDoc - 1].ValueM.reserve(struct_parm.num_classes); 00657 for (k = 0; k < struct_parm.num_classes; k++) 00658 vecResult[iDoc - 1].ValueM.push_back(label.Class[k]); 00659 00660 vecResult[iDoc - 1].num_class=struct_parm.num_classes; 00661 vecResult[iDoc - 1].Scores.reserve(struct_parm.num_classes); 00662 for (k = 0; k < struct_parm.num_classes; k++) 00663 vecResult[iDoc - 1].Scores.push_back(label.scores[k]); 00664 FreeDoc(pattern.doc); 00665 } 00666 } 00667 00668 return vecResult; 00669 } 00670 00671 00672 void CSVMSTRUCTTREE::FreeSample_leave_Doc(SAMPLE s){ 00673 /* Frees the memory of sample s. */ 00674 int i; 00675 for(i=0;i<s.n;i++) { 00676 free(s.examples[i].x.doc); 00677 free_label(s.examples[i].y); 00678 } 00679 free(s.examples); 00680 } 00681 00682 00683 00684 } 00685