Sleipnir
src/svmstructtree.cpp
/*****************************************************************************
* This file is provided under the Creative Commons Attribution 3.0 license.
*
* You are free to share, copy, distribute, transmit, or adapt this work
* PROVIDED THAT you attribute the work to the authors listed below.
* For more information, please see the following web page:
* http://creativecommons.org/licenses/by/3.0/
*
* This file is a component of the Sleipnir library for functional genomics,
* authored by:
* Curtis Huttenhower (chuttenh@princeton.edu)
* Mark Schroeder
* Maria D. Chikina
* Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
*
* If you use this library, the included executable tools, or any related
* code in your work, please cite the following publication:
* Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
* Olga G. Troyanskaya.
* "The Sleipnir library for computational functional genomics"
*****************************************************************************/
#include "stdafx.h"
#include "svmstructtree.h"
#include "pclset.h"
#include "dataset.h"
#include "meta.h"
#include "genome.h"
#include "compactmatrix.h"
#include <vector>
#include <set>

#define  SLACK_RESCALING    1
#define  MARGIN_RESCALING   2

namespace SVMArc {
    //extern "C" {
    //  //    void free_struct_model(STRUCTMODEL sm);
    //  void free_struct_sample(SAMPLE s);
    //  //    void svm_learn_struct_joint_custom(SAMPLE sample,
    //  //            STRUCT_LEARN_PARM *sparm,
    //  //            LEARN_PARM *lparm, KERNEL_PARM *kparm,
    //  //            STRUCTMODEL *sm);
    //  //    SAMPLE read_struct_examples_sleipnir(DOC **all_docs, double*all_labels, int example_size, int total_features, STRUCT_LEARN_PARM *sparm);
    //  //    void free_struct_model(STRUCTMODEL sm);
    //  //    void free_struct_sample(SAMPLE s);
    //  //    void set_struct_verbosity(long verb);
    //  //    double estimate_r_delta_average(DOC **, long, KERNEL_PARM *);
    //  //    MODEL *read_model(char *);
    //  LABEL classify_struct_example(PATTERN x, STRUCTMODEL *sm,
    //      STRUCT_LEARN_PARM *sparm);
    //  DOC* create_example(long, long, long, double, SVECTOR *);
    //  SVECTOR * create_svector(WORD *, char *, double);
    //  void set_struct_verbosity(long verb);

    //}

    void CSVMSTRUCTTREE::SetVerbosity(size_t V) {
        struct_verbosity = (long) V;
        //if( struct_verbosity>1)
        //  struct_verbosity=1;
    }

    bool CSVMSTRUCTTREE::initialize() {

        /* set defaults */
        Alg = DEFAULT_ALG_TYPE;
        //struct parms
        struct_parm.C=-0.01;
        struct_parm.slack_norm=1;
        struct_parm.epsilon=DEFAULT_EPS;
        struct_parm.custom_argc=0;
        struct_parm.loss_function=DEFAULT_LOSS_FCT;
        struct_parm.loss_type=DEFAULT_RESCALING;
        struct_parm.newconstretrain=100;
        struct_parm.ccache_size=5;
        struct_parm.batch_size=100;
        //learn parms
        //strcpy (learn_parm.predfile, "trans_predictions");
        strcpy(learn_parm.alphafile, "");
        //verbosity=0;/*verbosity for svm_light*/
        //struct_verbosity = 1; /*verbosity for struct learning portion*/
        learn_parm.biased_hyperplane=1;
        learn_parm.remove_inconsistent=0;
        learn_parm.skip_final_opt_check=0;
        learn_parm.svm_maxqpsize=10;
        learn_parm.svm_newvarsinqp=0;
        learn_parm.svm_iter_to_shrink=-9999;
        learn_parm.maxiter=100000;
        learn_parm.kernel_cache_size=40;
        learn_parm.svm_c=99999999;  /* overridden by struct_parm.C */
        learn_parm.eps=0.001;       /* overridden by struct_parm.epsilon */
        learn_parm.transduction_posratio=-1.0;
        learn_parm.svm_costratio=1.0;
        learn_parm.svm_costratio_unlab=1.0;
        learn_parm.svm_unlabbound=1E-5;
        learn_parm.epsilon_crit=0.001;
        learn_parm.epsilon_a=1E-10;  /* changed from 1e-15 */
        learn_parm.compute_loo=0;
        learn_parm.rho=1.0;
        learn_parm.xa_depth=0;
        kernel_parm.kernel_type=0;
        kernel_parm.poly_degree=3;
        kernel_parm.rbf_gamma=1.0;
        kernel_parm.coef_lin=1;
        kernel_parm.coef_const=1;
        strcpy(kernel_parm.custom, "empty");

        if (learn_parm.svm_iter_to_shrink == -9999) {
            learn_parm.svm_iter_to_shrink = 100;
        }

        if ((learn_parm.skip_final_opt_check)
            && (kernel_parm.kernel_type == LINEAR)) {
                printf(
                    "\nIt does not make sense to skip the final optimality check for linear kernels.\n\n");
                learn_parm.skip_final_opt_check = 0;
        }

        /* set the number of features to -1, indicating that it will be computed
           in init_struct_model() */
        struct_parm.num_features = -1;

        return true;
    }

    bool CSVMSTRUCTTREE::parms_check() {
        if ((learn_parm.skip_final_opt_check) && (learn_parm.remove_inconsistent)) {
            fprintf(
                stderr,
                "\nIt is necessary to do the final optimality check when removing inconsistent\nexamples.\n");
            return false;
        }
        if (learn_parm.svm_maxqpsize < 2) {
            fprintf(
                stderr,
                "\nMaximum size of QP-subproblems not in valid range: %ld [2..]\n",
                learn_parm.svm_maxqpsize);
            return false;
        }
        if (learn_parm.svm_maxqpsize < learn_parm.svm_newvarsinqp) {
            fprintf(
                stderr,
                "\nMaximum size of QP-subproblems [%ld] must be larger than the number of\n",
                learn_parm.svm_maxqpsize);
            fprintf(
                stderr,
                "new variables [%ld] entering the working set in each iteration.\n",
                learn_parm.svm_newvarsinqp);
            return false;
        }
        if (learn_parm.svm_iter_to_shrink < 1) {
            fprintf(
                stderr,
                "\nMaximum number of iterations for shrinking not in valid range: %ld [1..]\n",
                learn_parm.svm_iter_to_shrink);
            return false;
        }
        if (struct_parm.C < 0) {
            fprintf(
                stderr,
                "\nTrade-off between training error and margin is not set (C<0)!\nC will be set to its default value: Clight = Cperf * 100 / n\n");
        }
        if (learn_parm.transduction_posratio > 1) {
            fprintf(stderr,
                "\nThe fraction of unlabeled examples to classify as positives must\n");
            fprintf(stderr, "be less than 1.0!\n\n");
            return false;
        }
        if (learn_parm.svm_costratio <= 0) {
            fprintf(stderr,
                "\nThe COSTRATIO parameter must be greater than zero!\n\n");
            return false;
        }
        if (struct_parm.epsilon <= 0) {
            fprintf(stderr,
                "\nThe epsilon parameter must be greater than zero!\n\n");
            return false;
        }
        if ((struct_parm.slack_norm < 1) || (struct_parm.slack_norm > 2)) {
            fprintf(stderr,
                "\nThe norm of the slacks must be either 1 (L1-norm) or 2 (L2-norm)!\n\n");
            return false;
        }
        if (struct_parm.loss_type != MARGIN_RESCALING) {
            fprintf(
                stderr,
                "\nThe loss type must be margin rescaling!\n\n");
            return false;
        }
        if (struct_parm.num_classes < 2) {
            fprintf(
                stderr,
                "\nAt least two classes in the label are required!\n\n");
            return false;
        }
        //if (struct_parm.num_features<1){
        //  fprintf(
        //      stderr,
        //      "\nAt least one feature is required!\n\n");
        //  return false;
        //}
        if (learn_parm.rho < 0) {
            fprintf(stderr,
                "\nThe parameter rho for xi/alpha-estimates and leave-one-out pruning must\n");
            fprintf(stderr,
                "be greater than zero (typically 1.0 or 2.0, see T. Joachims, Estimating the\n");
            fprintf(stderr,
                "Generalization Performance of an SVM Efficiently, ICML, 2000)!\n\n");
            return false;
        }
        if ((learn_parm.xa_depth < 0) || (learn_parm.xa_depth > 100)) {
            fprintf(stderr,
                "\nThe parameter depth for ext. xi/alpha-estimates must be in [0..100] (zero\n");
            fprintf(stderr,
                "for switching to the conventional xi/alpha-estimates described in T. Joachims,\n");
            fprintf(
                stderr,
                "Estimating the Generalization Performance of an SVM Efficiently, ICML, 2000).\n");
            return false;
        }

        return true;
    }

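    /*
     * ReadOntology: builds the ontology tree handed to svm-struct. Inferred
     * from the parser below, each line of treefile lists a parent term
     * followed by its child terms, whitespace-delimited, e.g.
     *
     *   GO:0008150  GO:0009987  GO:0065007
     *   GO:0009987  GO:0007154
     *
     * (hypothetical GO identifiers, for illustration only). Terms receive
     * indices in order of first appearance, so the root term must appear
     * first; children may later appear as parents on their own lines.
     */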
    void CSVMSTRUCTTREE::ReadOntology(const char* treefile) {
        vector<ONTONODE*> nodes;
        vector<TEMPNODE> tempnodes;
        ONTONODE* newnode;
        TEMPNODE newtempnode;
        int currentindex = 0;
        ifstream ifsm;
        ifsm.clear();
        ifsm.open(treefile);
        if (!ifsm.is_open()) {
            cerr << "Could not read Onto file" << endl;
            exit(1);
        }
        static const size_t c_iBuffer = 1024; //increase if lines can be longer
        char acBuffer[c_iBuffer];
        vector<string> vecstrTokens;
        map<string, int>::iterator it;

        while (!ifsm.eof()) {
            /* read in the text file */
            ifsm.getline(acBuffer, c_iBuffer - 1);
            acBuffer[c_iBuffer - 1] = 0;
            vecstrTokens.clear();
            CMeta::Tokenize(acBuffer, vecstrTokens);

            if (vecstrTokens.empty())
                continue;
            if (vecstrTokens.size() < 2) {
                cerr << "Illegal line (" << vecstrTokens.size() << "): "
                    << acBuffer << endl;
                continue;
            }

            //construct the tree; a correctness check of the file is not written yet
            //construct the string-to-index mapping onto_map
            it = onto_map.find(vecstrTokens[0]);
            if (it == onto_map.end()) {
                currentindex = onto_map.size();
                onto_map[vecstrTokens[0]] = currentindex;
                onto_map_rev[currentindex] = vecstrTokens[0];
                newnode = (ONTONODE*)my_malloc(sizeof(ONTONODE));
                newnode->parent = NULL; //my_malloc does not zero memory; the root's parent must be well-defined
                cerr << "Read new Onto Term: " << vecstrTokens[0] << endl;
                nodes.push_back(newnode);
                tempnodes.push_back(newtempnode);
                //shall I add the node name to the tree structure?
            }
            for (size_t i = 1; i < vecstrTokens.size(); i++) {
                it = onto_map.find(vecstrTokens[i]);
                if (it == onto_map.end()) {
                    currentindex = onto_map.size();
                    onto_map[vecstrTokens[i]] = currentindex;
                    cerr << "Read new Onto Term: " << vecstrTokens[i] << endl;
                    onto_map_rev[currentindex] = vecstrTokens[i];
                    newnode = (ONTONODE*)my_malloc(sizeof(ONTONODE));
                    newnode->parent = NULL;
                    nodes.push_back(newnode);
                    tempnodes.push_back(newtempnode);
                }

                nodes[onto_map[vecstrTokens[i]]]->parent = nodes[onto_map[vecstrTokens[0]]];
                tempnodes[onto_map[vecstrTokens[0]]].children.insert(nodes[onto_map[vecstrTokens[i]]]);
            }
        }
        ONTONODE** newchildren;

        for (size_t i = 0; i < nodes.size(); i++) {
            nodes[i]->n_children = tempnodes[i].children.size();
            //copy children
            newchildren = (ONTONODE**)my_malloc(sizeof(ONTONODE*) * nodes[i]->n_children);
            copy(tempnodes[i].children.begin(), tempnodes[i].children.end(), newchildren);
            nodes[i]->children = newchildren;
            //fill in the ontology struct parameters
            nodes[i]->index = i;
            nodes[i]->inputlabelCount = 0;
            if (nodes[i]->n_children == 0)
                nodes[i]->isLeafnode = 1;
            else
                nodes[i]->isLeafnode = 0;
            nodes[i]->weight = 1;
        }

        //copy all nodes to a C-style array
        ONTONODE** allnewnodes = (ONTONODE**)my_malloc(sizeof(ONTONODE*) * nodes.size());
        copy(nodes.begin(), nodes.end(), allnewnodes);

        /* pass the tree to struct_parm */
        struct_parm.treeStruct.nodes = allnewnodes;
        struct_parm.treeStruct.n_nodes = nodes.size();
        struct_parm.num_classes = nodes.size();

        //release the local vectors; the nodes are now owned by struct_parm
        nodes.clear();
        tempnodes.clear();
    }

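    /*
     * CreateDoc: converts one PCL row (a gene's expression profile) into an
     * svm-light DOC. Each experiment becomes one feature (wnum = column + 1),
     * NaN values are imputed as 0, a constant bias feature of weight 1 is
     * appended, and the word list is terminated by wnum = 0 as svm-light
     * requires.
     */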
    DOC* CSVMSTRUCTTREE::CreateDoc(Sleipnir::CPCL &PCL, size_t iGene, size_t iDoc) {
        WORD* aWords;
        size_t i, iWords;
        float d;
        DOC* pRet;
        //get the number of features
        iWords = PCL.GetExperiments();
        aWords = new WORD[iWords + 2];
        //set the words
        for (i = 0; i < iWords; ++i) {
            aWords[i].wnum = i + 1;
            if (!Sleipnir::CMeta::IsNaN(d = PCL.Get(iGene, i)))
                aWords[i].weight = d;
            else
                aWords[i].weight = 0;
        }
        aWords[i].wnum = iWords + 1; //add a constant bias feature
        aWords[i].weight = 1;
        aWords[i + 1].wnum = 0;      //terminator
        pRet = create_example(iDoc, 0, 0, 1, create_svector(aWords, "", 1));
        delete[] aWords;
        return pRet;
    }

//DOC* CSVMSTRUCTTREE::CreateDoc(Sleipnir::CDat& Dat, size_t iGene, size_t iDoc) {
//  WORD* aWords;
//  size_t i, j, iWord, iWords;
//  float d;
//  DOC* pRet;
//  pRet->fvec->words[0].weight;
//  //get number of features
//  iWords = Dat.GetGenes();
//  //      cout << "CD:iwords=" << iWords << endl;
//  aWords = new WORD[iWords + 1];
//  //number the words
//  for (i = 0; i < iWords; ++i) {
//      //   cout<<i<<endl;
//      aWords[i].wnum = i + 1;
//      // asWords[ i ].wnum = 0;
//  }
//  aWords[i].wnum = 0;
//  //get the values;
//  iWord = 0;
//  for (i = 0; i < Dat.GetGenes(); i++) {
//      if (!Sleipnir::CMeta::IsNaN(d = Dat.Get(iGene, i))) {
//          //   if (i==0 && j==0)
//          //       cout<<"First value is "<<d<<endl;
//          aWords[iWord].weight = d;
//      } else
//          aWords[iWord].weight = 0;
//      iWord++;
//  }
//  pRet = create_example(iDoc, 0, 0, 1, create_svector(aWords, "", 1));
//  delete[] aWords;
//  // cout<<"done creating DOC"<<endl;
//  return pRet;
//}

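    /*
     * ReadLabels: parses the label file. Inferred from the parser below, each
     * line names a gene followed by the ontology terms annotated to it, e.g.
     *
     *   YAL001C  GO:0006351  GO:0006355
     *
     * (hypothetical identifiers, for illustration). Every annotation is
     * propagated upward to the root, so a gene labeled with a term is also
     * labeled with all of that term's ancestors.
     */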
    vector<SVMLabel> CSVMSTRUCTTREE::ReadLabels(ifstream & ifsm) {
        static const size_t c_iBuffer = 65532;
        char acBuffer[c_iBuffer];
        vector<string> vecstrTokens;
        vector<char> multilabels;
        vector<SVMLabel> vecLabels;
        ONTONODE* pnode;

        if (struct_parm.num_classes == 0) {
            cerr << "Ontology must be read before reading labels!" << endl;
            return vecLabels; //without the ontology, the indexing below would be invalid
        }
        cerr << struct_parm.num_classes << " classes read!" << endl;
        multilabels.resize(struct_parm.num_classes);
        map<string, int>::iterator it;
        while (!ifsm.eof()) {
            ifsm.getline(acBuffer, c_iBuffer - 1);
            acBuffer[c_iBuffer - 1] = 0;
            vecstrTokens.clear();
            CMeta::Tokenize(acBuffer, vecstrTokens);
            if (vecstrTokens.empty())
                continue;
            if (vecstrTokens.size() < 2) {
                cerr << "Illegal label line (" << vecstrTokens.size() << "): "
                    << acBuffer << endl;
                continue;
            }

            for (size_t i = 1; i < multilabels.size(); i++)
                multilabels[i] = 0;
            multilabels[0] = 1; //the root node is always on
            for (size_t i = 1; i < vecstrTokens.size(); i++) {
                it = onto_map.find(vecstrTokens[i]);
                if (it == onto_map.end()) {
                    if (struct_verbosity >= 2)
                        cerr << "Unknown term: " << vecstrTokens[i] << endl;
                }
                else {
                    multilabels[onto_map[vecstrTokens[i]]] = 1;
                    struct_parm.treeStruct.nodes[onto_map[vecstrTokens[i]]]->inputlabelCount++;
                    if (struct_verbosity >= 3)
                        cout << vecstrTokens[0] << '\t' << vecstrTokens[i];
                    //label propagation: walk up to the root, marking ancestors
                    pnode = struct_parm.treeStruct.nodes[onto_map[vecstrTokens[i]]]->parent;
                    while (pnode && multilabels[pnode->index] != 1) {
                        multilabels[pnode->index] = 1;
                        struct_parm.treeStruct.nodes[pnode->index]->inputlabelCount++;
                        if (struct_verbosity >= 3)
                            cout << '\t' << onto_map_rev[pnode->index];
                        pnode = struct_parm.treeStruct.nodes[pnode->index]->parent;
                    }
                    if (struct_verbosity >= 3)
                        cout << endl;
                    //end label propagation
                }
            }
            preprocessLabel(&multilabels);
            vecLabels.push_back(SVMArc::SVMLabel(vecstrTokens[0], multilabels));
        }
        return vecLabels;
    }

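    /*
     * preprocessLabel: for every annotated term whose children are all
     * unannotated (a most-specific annotation), mark its entire subtree
     * with 2 and restore the term itself to 1. This appears to flag
     * descendants of the deepest annotations as "unknown" rather than
     * negative, so predictions that are merely more specific than the
     * given labels are not treated as errors.
     */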
    void CSVMSTRUCTTREE::preprocessLabel(vector<char>* multilabels) {
        int i, iclass, flag_childrenannotated;
        for (iclass = 0; iclass < multilabels->size(); iclass++) {
            if ((*multilabels)[iclass] == 1) {
                flag_childrenannotated = 0;
                for (i = 0; i < struct_parm.treeStruct.nodes[iclass]->n_children; i++) {
                    if ((*multilabels)[struct_parm.treeStruct.nodes[iclass]->children[i]->index] == 1) {
                        flag_childrenannotated = 1;
                        break;
                    }
                }
                if (flag_childrenannotated == 0) {
                    vecsetZero(struct_parm.treeStruct.nodes[iclass], multilabels, 2);
                    (*multilabels)[iclass] = 1;
                }
            }
        }
    }

    void CSVMSTRUCTTREE::vecsetZero(ONTONODE* node, vector<char>* ybar0, char zero) {
        int i;
        if ((*ybar0)[node->index] != zero) {
            (*ybar0)[node->index] = zero;
            for (i = 0; i < node->n_children; i++)
                if ((*ybar0)[node->children[i]->index] != zero)
                    vecsetZero(node->children[i], ybar0, zero);
        }
    }

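    /*
     * InitializeLikAfterReadLabels: precomputes, for each term i, the
     * negative log of the add-one-smoothed conditional probability of a gene
     * having term i given that it has i's parent:
     *
     *   condLikelihood[i] = log((count(parent(i)) + 1) / (count(i) + 1))
     *
     * along with positive/negative class-balance weights derived from the
     * annotation counts gathered in ReadLabels:
     *
     *   posBalanceWeight = (count(root)/2) / count(i)
     *   negBalanceWeight = (count(root)/2) / (count(root) - count(i))
     *
     * Index 0 must be the root term.
     */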
    void CSVMSTRUCTTREE::InitializeLikAfterReadLabels() {
        struct_parm.condLikelihood = (double*)my_malloc(sizeof(double) * struct_parm.num_classes);
        struct_parm.condLikelihood[0] = 0; //the first term in the onto file must be the root node; TODO: make this more robust
        for (int i = 1; i < struct_parm.num_classes; i++) {
            if (struct_parm.treeStruct.nodes[i]->inputlabelCount > 0) {
                //cast to double first (assuming inputlabelCount is integral) so the ratios are not truncated
                struct_parm.treeStruct.nodes[i]->posBalanceWeight = ((double)struct_parm.treeStruct.nodes[0]->inputlabelCount / 2) / struct_parm.treeStruct.nodes[i]->inputlabelCount;
                struct_parm.treeStruct.nodes[i]->negBalanceWeight = ((double)struct_parm.treeStruct.nodes[0]->inputlabelCount / 2) / (struct_parm.treeStruct.nodes[0]->inputlabelCount - struct_parm.treeStruct.nodes[i]->inputlabelCount);
            } else {
                struct_parm.treeStruct.nodes[i]->posBalanceWeight = 0;
                struct_parm.treeStruct.nodes[i]->negBalanceWeight = 0;
            }
            struct_parm.condLikelihood[i] = log(struct_parm.treeStruct.nodes[i]->parent->inputlabelCount + 1)
                - log(struct_parm.treeStruct.nodes[i]->inputlabelCount + 1);
        }
    }
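
    /*
     * CreateSample: assembles the svm-struct training SAMPLE. Each labeled
     * gene found in the PCL becomes one EXAMPLE whose pattern x is the DOC
     * built by CreateDoc and whose label y is the (propagated, preprocessed)
     * multilabel vector from ReadLabels. Genes missing from the PCL are
     * silently skipped.
     */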
    SAMPLE* CSVMSTRUCTTREE::CreateSample(Sleipnir::CPCL &PCL, vector<SVMLabel> SVMLabels) {
        size_t i, iGene, iDoc;
        int n; /* number of examples */
        vector<char*> target;
        char* newmultilabel;
        SAMPLE* pSample = new SAMPLE;
        EXAMPLE* examples;
        DOC** docs;
        vector<DOC*> vec_pDoc;
        vec_pDoc.reserve(SVMLabels.size());
        vector< vector<char> > vecClass;
        vecClass.reserve(SVMLabels.size());
        iDoc = 0;

        for (i = 0; i < SVMLabels.size(); i++) {
            if (!SVMLabels[i].hasIndex) {
                SVMLabels[i].SetIndex(PCL.GetGene(SVMLabels[i].GeneName));
            }
            iGene = SVMLabels[i].index;
            if (iGene != -1) {
                iDoc++;
                vec_pDoc.push_back(CreateDoc(PCL, iGene, iDoc - 1));
                vecClass.push_back(SVMLabels[i].TargetM);
            }
        }

        //copy the patterns and labels to C-style arrays
        docs = new DOC*[vec_pDoc.size()];
        n = vec_pDoc.size();
        copy(vec_pDoc.begin(), vec_pDoc.end(), docs);
        vec_pDoc.clear();

        target.resize(vecClass.size());
        for (i = 0; i < vecClass.size(); i++) {
            newmultilabel = (char*)my_malloc(sizeof(char) * vecClass[i].size());
            copy(vecClass[i].begin(), vecClass[i].end(), newmultilabel);
            target[i] = newmultilabel;
        }
        vecClass.clear();

        examples = (EXAMPLE*)my_malloc(sizeof(EXAMPLE) * n);

        for (i = 0; i < n; i++) { /* copy the docs over into the new data structure */
            examples[i].x.doc = docs[i];
            examples[i].y.Class = target[i];
            examples[i].y.num_classes = struct_parm.num_classes;
        }
        target.clear();
        delete[] docs; //array new must be paired with delete[]
        pSample->n = n;
        pSample->examples = examples;

        if (struct_verbosity >= 0)
            printf(" (%d examples) ", pSample->n);

        return pSample;
    }

//
//  SAMPLE* CSVMSTRUCTTREE::CreateSample(Sleipnir::CDat& Dat, vector<SVMLabel> SVMLabels) {
//  size_t i, j, iGene, iDoc;
//  vector<DOC*> vec_pDoc;
//  vector<double> vecClass;
//  vector<size_t> veciGene;
//  iDoc = 0;
//  float numPositives, numNegatives;
//  numPositives = numNegatives = 0;
//  for (i = 0; i < SVMLabels.size(); i++) {
//      //     cout<< "processing gene " << SVMLabels[i].GeneName << endl;
//      iGene = Dat.GetGene(SVMLabels[i].GeneName);
//      //   cout << SVMLabels[i].GeneName<<" gene at location "<<iGene << endl;
//      if (iGene != -1) {
//          //       cout << "creating doc" << endl;
//          iDoc++;
//          vec_pDoc.push_back(CreateDoc(Dat, iGene, iDoc - 1));
//          vecClass.push_back(SVMLabels[i].Target);
//      }
//  }
//
//  DOC** ppDoc;
//  ppDoc = new DOC*[vec_pDoc.size()];
//  copy(vec_pDoc.begin(), vec_pDoc.end(), ppDoc);
//  vec_pDoc.clear();
//  PATTERN* pPattern = new PATTERN;
//  pPattern->doc = ppDoc;
//
//  pPattern->totdoc = iDoc;
//  //   cout << "number of document=" << pPattern->totdoc << endl;
//  LABEL* pLabel = new LABEL;
//  double* aClass;
//  aClass = new double[vecClass.size()];
//  copy(vecClass.begin(), vecClass.end(), aClass);
//  vecClass.clear();
//  pLabel->Class = aClass;
//  pLabel->totdoc = iDoc;
//
//  EXAMPLE* aExample;
//  aExample = new EXAMPLE[1];
//  //cout<<"aExample @"<<aExample<<endl;
//  aExample[0].x = *pPattern;
//  aExample[0].y = *pLabel;
//  SAMPLE* pSample = new SAMPLE;
//  pSample->n = 1;
//  pSample->examples = aExample;
//  /* cout << "examples @" << pSample->examples << endl;
//   cout<< "ppDoc="<<ppDoc<<endl;
//   cout << "docs @" << pSample->examples[0].x.doc << endl;
//   cout<<"done creating sample"<<endl;
//   cout<<"sample @ "<<pSample<<endl;*/
//  return pSample;
//}

    //Single-gene classification

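    /*
     * Classify: applies the trained structured model to each labeled gene in
     * the PCL, one DOC at a time. For every gene it records the predicted
     * per-term indicators (label.Class) and the per-term scores
     * (label.scores) in a Result alongside the gold-standard multilabel.
     */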
    vector<Result> CSVMSTRUCTTREE::Classify(Sleipnir::CPCL &PCL,
        vector<SVMLabel> SVMLabels) {
            size_t i, k, iGene, iDoc;
            vector<int> vecClass;
            vector<Result> vecResult;
            iDoc = 0;
            PATTERN pattern;
            pattern.totdoc = 1;
            cerr << "CLASSIFY classifying " << endl;
            LABEL label;
            for (i = 0; i < SVMLabels.size(); i++) {
                if (!SVMLabels[i].hasIndex) {
                    SVMLabels[i].SetIndex(PCL.GetGene(SVMLabels[i].GeneName));
                }
                iGene = SVMLabels[i].index;
                if (iGene != -1) {
                    iDoc++;

                    pattern.doc = CreateDoc(PCL, iGene, iDoc);
                    label = classify_struct_example(pattern, &structmodel,
                        &struct_parm);
                    vecClass.push_back(SVMLabels[i].Target);
                    vecResult.resize(iDoc);
                    vecResult[iDoc - 1].GeneName = SVMLabels[i].GeneName;
                    vecResult[iDoc - 1].TargetM = SVMLabels[i].TargetM;
                    vecResult[iDoc - 1].ValueM.reserve(struct_parm.num_classes);
                    for (k = 0; k < struct_parm.num_classes; k++)
                        vecResult[iDoc - 1].ValueM.push_back(label.Class[k]);

                    vecResult[iDoc - 1].num_class = struct_parm.num_classes;
                    vecResult[iDoc - 1].Scores.reserve(struct_parm.num_classes);
                    for (k = 0; k < struct_parm.num_classes; k++)
                        vecResult[iDoc - 1].Scores.push_back(label.scores[k]);
                    free_label(label); //release the label's buffers once copied into the Result
                    FreeDoc(pattern.doc);
                }
            }

            return vecResult;
    }

    /* Frees the memory of sample s: the DOC shells, the labels, and the
       example array. The feature vectors inside each DOC are left intact
       (hence "leave_Doc"). */
    void CSVMSTRUCTTREE::FreeSample_leave_Doc(SAMPLE s) {
        int i;
        for (i = 0; i < s.n; i++) {
            free(s.examples[i].x.doc);
            free_label(s.examples[i].y);
        }
        free(s.examples);
    }

}
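
/*
 * A minimal usage sketch (hypothetical driver code, not part of Sleipnir;
 * it assumes a trained structmodel is in place before Classify is called):
 *
 *   SVMArc::CSVMSTRUCTTREE svm;
 *   svm.SetVerbosity(1);
 *   svm.initialize();
 *   svm.ReadOntology("ontology.txt");        //parent-child term lines
 *   std::ifstream ifsmLabels("labels.txt");  //gene-to-term annotations
 *   std::vector<SVMArc::SVMLabel> labels = svm.ReadLabels(ifsmLabels);
 *   svm.InitializeLikAfterReadLabels();
 *   if (svm.parms_check()) {
 *       SAMPLE* pSample = svm.CreateSample(PCL, labels); //PCL: a loaded Sleipnir::CPCL
 *       // ... train the structured model, then:
 *       std::vector<SVMArc::Result> results = svm.Classify(PCL, labels);
 *   }
 */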