Sleipnir
src/svm.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "svm.h"
00024 #include "pclset.h"
00025 #include "dataset.h"
00026 #include "meta.h"
00027 #include "genome.h"
00028 
00029 #ifndef NO_SVM_PERF
00030 
00031 extern "C" {
00032 KERNEL_CACHE* kernel_cache_init( long, long );
00033 void kernel_cache_cleanup( KERNEL_CACHE* );
00034 void svm_learn_classification( DOC**, double*, long, long, LEARN_PARM*, KERNEL_PARM*,
00035     KERNEL_CACHE*, MODEL*, double* );
00036 void svm_learn_regression( DOC**, double*, long, long, LEARN_PARM*, KERNEL_PARM*,
00037     KERNEL_CACHE**, MODEL* );
00038 void svm_learn_ranking( DOC**, double*, long, long, LEARN_PARM*, KERNEL_PARM*,
00039     KERNEL_CACHE**, MODEL* );
00040 void svm_learn_optimization( DOC**, double*, long, long, LEARN_PARM*, KERNEL_PARM*,
00041     KERNEL_CACHE*, MODEL*, double* );
00042 }
00043 
00044 namespace Sleipnir {
00045 
00046 bool read_documents_bin( char* szFile, DOC*** papDocs, double** padLabels,
00047     uint32_t* piWords, uint32_t* piDocs ) {
00048     char            szComment[ 1024 ];
00049     FILE*           pfileDoc;
00050     SVMPerf::WORD*  aWords;
00051     uint32_t        i, iDoc, iWord;
00052     float           d;
00053     float*          ad;
00054 
00055     g_CatSleipnir( ).info( "CSVM::read_documents_bin( ) Reading binary examples into memory..." );
00056 
00057 #pragma warning( disable : 4996 )
00058     if( !( pfileDoc = fopen( szFile, "rb" ) ) ) {
00059 #pragma warning( default : 4996 )
00060         g_CatSleipnir( ).error( "CSVM::read_documents_bin( ) Could not open: %s", szFile );
00061         return false; }
00062 
00063     fread( piWords, sizeof(*piWords), 1, pfileDoc );
00064     fread( piDocs, sizeof(*piDocs), 1, pfileDoc );
00065 
00066     (*papDocs) = (DOC**)my_malloc( sizeof(DOC*) * (*piDocs) );
00067     (*padLabels) = (double*)my_malloc( sizeof(double) * (*piDocs) );
00068     ad = (float*)my_malloc( sizeof(*ad) * (*piWords) );
00069     aWords = (SVMPerf::WORD*)my_malloc( sizeof(SVMPerf::WORD) * ( (*piWords) + 1 ) );
00070     for( iWord = 0; iWord < (*piWords); ++iWord )
00071         aWords[ iWord ].wnum = iWord + 1;
00072     aWords[ iWord ].wnum = 0;
00073     for( iDoc = 0; iDoc < (*piDocs); ++iDoc ) {
00074         if( !( iDoc % 100000 ) )
00075             g_CatSleipnir( ).info( "CSVM::read_documents_bin( ) Read %d/%d", iDoc, *piDocs );
00076         fread( &d, sizeof(d), 1, pfileDoc );
00077         (*padLabels)[ iDoc ] = d;
00078         fread( ad, sizeof(*ad), (*piWords), pfileDoc );
00079         for( iWord = 0; iWord < (*piWords); ++iWord )
00080             aWords[ iWord ].weight = ad[ iWord ];
00081         fread( &i, sizeof(i), 1, pfileDoc );
00082         if( i )
00083             fread( szComment, sizeof(*szComment), i, pfileDoc );
00084         szComment[ i ] = 0;
00085         (*papDocs)[ iDoc ] = create_example( iDoc, 0, 0, 1, create_svector( aWords, szComment, 1 ) ); }
00086     free( aWords );
00087     free( ad );
00088 
00089     fclose( pfileDoc );
00090     return true;
00091 }
00092 
// Shared scratch array of c_iWords word slots.  CreateDoc and EvaluateFile fill
// it instead of a heap buffer whenever the word count is below c_iWords.
00093 SVMPerf::WORD   CSVMImpl::s_asWords[ CSVMImpl::c_iWords ];
00094 
00095 CSVMImpl::SLearn::SLearn( ) {
00096 
00097     predfile[ 0 ] = 0;
00098     alphafile[ 0 ] = 0;
00099     biased_hyperplane = 1;
00100     sharedslack = 0;
00101     remove_inconsistent = 0;
00102     skip_final_opt_check = 0;
00103     svm_maxqpsize = 10;
00104     svm_newvarsinqp = 0;
00105     svm_iter_to_shrink = -1;
00106     maxiter = 100000;
00107     kernel_cache_size = 40;
00108     svm_c = 0;
00109     eps = 0.1;
00110     transduction_posratio = -1.0;
00111     svm_costratio = 0;
00112     svm_costratio_unlab = 1;
00113     svm_unlabbound = 1e-5;
00114     epsilon_crit = 0.001;
00115     epsilon_a = 1e-15;
00116     compute_loo = 0;
00117     rho = 1;
00118     xa_depth = 0;
00119     type = CLASSIFICATION; }
00120 
00121 CSVMImpl::SKernel::SKernel( ) {
00122 
00123     kernel_type = 0;
00124     poly_degree = 3;
00125     rbf_gamma = 1;
00126     coef_lin = 1;
00127     coef_const = 1;
00128     custom[ 0 ] = 0; }
00129 
00130 CSVMImpl::CSVMImpl( ) : m_apDocs(NULL), m_iDocs(0), m_adAlphas(NULL), m_iAlphas(0),
00131     m_pModel(NULL), m_adLabels(NULL) {
00132 
00133     verbosity = 2; }
00134 
00135 CSVMImpl::~CSVMImpl( ) {
00136 
00137     Reset( true, true, true ); }
00138 
00139 void CSVMImpl::Reset( bool fData, bool fModel, bool fAlphas ) {
00140     size_t  i;
00141 
00142     if( fModel && m_pModel ) {
00143         free_model( m_pModel, 0 );
00144         m_pModel = NULL; }
00145     if( fAlphas && m_adAlphas ) {
00146         free( m_adAlphas );
00147         m_adAlphas = NULL; }
00148     if( fData ) {
00149         if( m_apDocs ) {
00150             for( i = 0; i < m_iDocs; ++i )
00151                 free_example( m_apDocs[ i ], 1 );
00152             delete[] m_apDocs;
00153             m_apDocs = NULL; }
00154         if( m_adLabels ) {
00155             delete[] m_adLabels;
00156             m_adLabels = NULL; } } }
00157 
00158 size_t CSVMImpl::GetWords( const SData& sData ) const {
00159     size_t  i, iRet;
00160 
00161     switch( sData.m_eType ) {
00162         case SData::EPCLs:
00163             for( iRet = i = 0; i < sData.m_uData.m_pPCLs->GetPCLs( ); ++i )
00164                 iRet += sData.m_uData.m_pPCLs->Get( i ).GetExperiments( );
00165             return ( iRet * 2 );
00166 
00167         case SData::EData:
00168             return sData.m_uData.m_pData->GetExperiments( );
00169 
00170         case SData::EFile:
00171             return m_iWords;
00172 
00173         case SData::EPCL:
00174             return sData.m_uData.m_pPCL->GetExperiments( ); }
00175 
00176     return -1; }
00177 
00178 DOC* CSVMImpl::CreateDoc( const SData& sData, size_t iOne, size_t iTwo, size_t iDoc ) const {
00179     SVMPerf::WORD*  asWords;
00180     size_t          i, j, iWord, iWords;
00181     float           d;
00182     DOC*            pRet;
00183 
00184     iWords = GetWords( sData );
00185     asWords = ( iWords >= c_iWords ) ? new SVMPerf::WORD[ iWords + 1 ] : s_asWords;
00186     for( i = 0; i < iWords; ++i )
00187         asWords[ i ].wnum = i + 1;
00188     asWords[ i ].wnum = 0;
00189     if( sData.m_eType == SData::EPCLs ) {
00190         const CPCLSet&  PCLs    = *sData.m_uData.m_pPCLs;
00191 
00192         for( iWord = i = 0; i < PCLs.GetPCLs( ); ++i ) {
00193             for( j = 0; j < PCLs.Get( i ).GetExperiments( ); ++j ) {
00194                 if( CMeta::IsNaN( d = PCLs.Get( i, iOne, j ) ) )
00195                     d = 0;
00196                 assert( ( iWord + j ) < iWords );
00197                 asWords[ iWord + j ].weight = d;
00198                 if( CMeta::IsNaN( d = PCLs.Get( i, iTwo, j ) ) )
00199                     d = 0;
00200                 assert( ( iWord + ( iWords / 2 ) + j ) < iWords );
00201                 asWords[ iWord + ( iWords / 2 ) + j ].weight = d; }
00202             iWord += PCLs.Get( i ).GetExperiments( ); } }
00203     else {
00204         const IDataset* pData   = sData.m_uData.m_pData;
00205 
00206         for( i = 0; i < pData->GetExperiments( ); ++i ) {
00207             if( CMeta::IsNaN( d = pData->GetContinuous( iOne, iTwo, i ) ) )
00208                 d = 0;
00209             asWords[ i ].weight = d; } }
00210 
00211     pRet = create_example( iDoc, 0, 0, 1, create_svector( asWords, "", 1 ) );
00212     if( asWords != s_asWords )
00213         delete[] asWords;
00214     return pRet; }
00215 
00216 DOC* CSVMImpl::CreateDoc( const SData& sData, size_t iGene ) const {
00217     SVMPerf::WORD*  asWords;
00218     size_t          i, iWords;
00219     DOC*            pRet;
00220     float           d;
00221 
00222     iWords = GetWords( sData );
00223     asWords = ( iWords >= c_iWords ) ? new SVMPerf::WORD[ iWords + 1 ] : s_asWords;
00224     for( i = 0; i < iWords; ++i )
00225         asWords[ i ].wnum = i + 1;
00226     asWords[ i ].wnum = 0;
00227 
00228     for( i = 0; i < iWords; ++i )
00229         asWords[ i ].weight = CMeta::IsNaN( d = sData.m_uData.m_pPCL->Get( iGene, i ) ) ? 0 : d;
00230     pRet = create_example( i, 0, 0, 1, create_svector( asWords, "", 1 ) );
00231 
00232     if( asWords != s_asWords )
00233         delete[] asWords;
00234     return pRet; }
00235 
00249 bool CSVM::OpenAlphas( std::istream& istm ) {
00250     static const size_t c_iBuf  = 1024;
00251     char            szBuf[ c_iBuf ];
00252     vector<float>   vecdAlphas;
00253     size_t          i;
00254 
00255     Reset( false, false, true );
00256     while( istm.peek( ) != EOF ) {
00257         istm.getline( szBuf, c_iBuf - 1 );
00258         vecdAlphas.push_back( (float)atof( szBuf ) ); }
00259     m_adAlphas = new double[ m_iAlphas = vecdAlphas.size( ) ];
00260     for( i = 0; i < m_iAlphas; ++i )
00261         m_adAlphas[ i ] = vecdAlphas[ i ];
00262 
00263     return true; }
00264 
00265 bool CSVMImpl::Initialize( const SData& sData ) {
00266     size_t          i, j, iOne, iTwo, iDoc;
00267     vector<size_t>  veciGenes;
00268     float           d;
00269 
00270     Reset( true, false, false );
00271     if( sData.m_eType == SData::EFile ) {
00272         read_documents_bin( (char*)sData.m_uData.m_szFile, &m_apDocs, &m_adLabels,
00273             &m_iWords, &m_iDocs );
00274         return true; }
00275     if( sData.m_eType == SData::EPCL ) {
00276         for( m_iDocs = i = 0; i < sData.m_uData.m_pPCL->GetGenes( ); ++i )
00277             if( !sData.m_uData.m_pPCL->IsMasked( i ) && ( !sData.m_pNegative ||
00278                 sData.m_uAnswers.m_pGenes->IsGene( sData.m_uData.m_pPCL->GetGene( i ) ) ||
00279                 sData.m_pNegative->IsGene( sData.m_uData.m_pPCL->GetGene( i ) ) ) )
00280                 m_iDocs++;
00281         m_apDocs = new DOC*[ m_iDocs ];
00282         m_adLabels = new double[ m_iDocs ];
00283         for( i = j = 0; i < sData.m_uData.m_pPCL->GetGenes( ); ++i )
00284             if( !sData.m_uData.m_pPCL->IsMasked( i ) ) {
00285                 const string&   strGene = sData.m_uData.m_pPCL->GetGene( i );
00286 
00287                 d = 0;
00288                 if( !sData.m_pNegative )
00289                     d = sData.m_uAnswers.m_pGenes->IsGene( strGene ) ? 1.0f : -1.0f;
00290                 else if( sData.m_uAnswers.m_pGenes->IsGene( strGene ) )
00291                     d = 1;
00292                 else if( sData.m_pNegative->IsGene( strGene ) )
00293                     d = -1;
00294                 if( d ) {
00295                     m_apDocs[ j ] = CreateDoc( sData, i );
00296                     m_adLabels[ j++ ] = d; } }
00297         return true; }
00298 
00299     veciGenes.resize( ( sData.m_eType == SData::EPCLs ) ?
00300         sData.m_uData.m_pPCLs->GetGenes( ) : sData.m_uData.m_pData->GetGenes( ) );
00301     for( i = 0; i < veciGenes.size( ); ++i )
00302         veciGenes[ i ] = sData.m_uAnswers.m_pAnswers->GetGene( ( sData.m_eType == SData::EPCLs ) ?
00303             sData.m_uData.m_pPCLs->GetGene( i ) : sData.m_uData.m_pData->GetGene( i ) );
00304     for( m_iDocs = i = 0; i < veciGenes.size( ); ++i )
00305         if( ( iOne = veciGenes[ i ] ) != -1 )
00306             for( j = ( i + 1 ); j < veciGenes.size( ); ++j )
00307                 if( ( ( iTwo = veciGenes[ j ] ) != -1 ) &&
00308                     !CMeta::IsNaN( sData.m_uAnswers.m_pAnswers->Get( iOne, iTwo ) ) )
00309                     m_iDocs++;
00310     m_apDocs = new DOC*[ m_iDocs ];
00311     m_adLabels = new double[ m_iDocs ];
00312 
00313     for( iDoc = i = 0; i < veciGenes.size( ); ++i )
00314         if( ( iOne = veciGenes[ i ] ) != -1 )
00315             for( j = ( i + 1 ); j < veciGenes.size( ); ++j )
00316                 if( ( ( iTwo = veciGenes[ j ] ) != -1 ) &&
00317                     !CMeta::IsNaN( d = sData.m_uAnswers.m_pAnswers->Get( iOne, iTwo ) ) ) {
00318                     m_adLabels[ iDoc ] = d ? 1 : -1;
00319                     m_apDocs[ iDoc++ ] = CreateDoc( sData, i, j, iDoc ); }
00320     assert( iDoc == m_iDocs );
00321 
00322     return true; }
00323 
00341 bool CSVM::Learn( const CPCL& PCL, const CGenes& GenesPositive ) {
00342     CGenes  GenesNeg( GenesPositive.GetGenome( ) );
00343 
00344     return Learn( PCL, GenesPositive, GenesNeg ); }
00345 
00366 bool CSVM::Learn( const CPCL& PCL, const CGenes& GenesPositive, const CGenes& GenesNegative ) {
00367     SData   sData;
00368 
00369     sData.m_eType = SData::EPCL;
00370     sData.m_uData.m_pPCL = &PCL;
00371     sData.m_uAnswers.m_pGenes = &GenesPositive;
00372     sData.m_pNegative = GenesNegative.GetGenes( ) ? &GenesNegative : NULL;
00373 
00374     return CSVMImpl::Learn( sData ); }
00375 
00376 bool CSVMImpl::Learn( const SData& sData ) {
00377     KERNEL_CACHE*   pCache;
00378     size_t          i, iNeg, iPos, iWords;
00379 
00380     Reset( false, true, false );
00381     m_pModel = (MODEL*)calloc( 1, sizeof(*m_pModel) );
00382     if( !Initialize( sData ) )
00383         return false;
00384     if( !m_sLearn.svm_costratio ) {
00385         for( iNeg = iPos = i = 0; i < m_iDocs; ++i )
00386             if( m_adLabels[ i ] == 1 )
00387                 iPos++;
00388             else
00389                 iNeg++;
00390         m_sLearn.svm_costratio = (float)iNeg / iPos; }
00391     if( m_sLearn.svm_iter_to_shrink < 0 )
00392         m_sLearn.svm_iter_to_shrink = ( m_sKernel.kernel_type == LINEAR ) ? 2 : 100;
00393     iWords = GetWords( sData );
00394 
00395     pCache = ( m_sKernel.kernel_type == LINEAR ) ? NULL :
00396         kernel_cache_init( m_iDocs, m_sLearn.kernel_cache_size );
00397     switch( m_sLearn.type ) {
00398         case CLASSIFICATION:
00399             svm_learn_classification( m_apDocs, m_adLabels, m_iDocs, iWords,
00400                 (LEARN_PARM*)&m_sLearn, (KERNEL_PARM*)&m_sKernel, pCache, m_pModel,
00401                 m_adAlphas );
00402             break;
00403 
00404         case REGRESSION:
00405             svm_learn_regression( m_apDocs, m_adLabels, m_iDocs, iWords,
00406                 (LEARN_PARM*)&m_sLearn, (KERNEL_PARM*)&m_sKernel, &pCache, m_pModel );
00407             break;
00408 
00409         case RANKING:
00410             svm_learn_ranking( m_apDocs, m_adLabels, m_iDocs, iWords, (LEARN_PARM*)&m_sLearn,
00411                 (KERNEL_PARM*)&m_sKernel, &pCache, m_pModel );
00412             break;
00413 
00414         case OPTIMIZATION:
00415             svm_learn_optimization( m_apDocs, m_adLabels, m_iDocs, iWords,
00416                 (LEARN_PARM*)&m_sLearn, (KERNEL_PARM*)&m_sKernel, pCache, m_pModel,
00417                 m_adAlphas );
00418             break; }
00419 
00420     if( pCache )
00421         kernel_cache_cleanup( pCache );
00422 
00423     return true; }
00424 
00441 bool CSVM::Save( std::ostream& ostm ) const {
00442     size_t      i, j;
00443     SVECTOR*    pVec;
00444 
00445     if( !m_pModel )
00446         return false;
00447 
00448     ostm << "SVM-light Version " << VERSION << endl;
00449     ostm << m_pModel->kernel_parm.kernel_type << " # kernel type" << endl;
00450     ostm << m_pModel->kernel_parm.poly_degree << " # kernel parameter -d" << endl;
00451     ostm << m_pModel->kernel_parm.rbf_gamma << " # kernel parameter -g" << endl;
00452     ostm << m_pModel->kernel_parm.coef_lin << " # kernel parameter -s" << endl;
00453     ostm << m_pModel->kernel_parm.coef_const << " # kernel parameter -r" << endl;
00454     ostm << m_pModel->kernel_parm.custom << "# kernel parameter -u" << endl;
00455     ostm << m_pModel->totwords << " # highest feature index" << endl;
00456     ostm << m_pModel->totdoc << " # number of training documents" << endl;
00457  
00458     for( i = j = 1; i < (size_t)m_pModel->sv_num; ++i )
00459         for( pVec = m_pModel->supvec[ i ]->fvec; pVec; pVec = pVec->next )
00460             j++;
00461     ostm << j << " # number of support vectors plus 1" << endl;
00462     ostm << m_pModel->b <<
00463         " # threshold b, each following line is a SV (starting with alpha*y)" << endl;
00464 
00465     for( i = 1; i < (size_t)m_pModel->sv_num; ++i )
00466         for( pVec = m_pModel->supvec[ i ]->fvec; pVec; pVec = pVec->next ) {
00467             ostm << ( m_pModel->alpha[ i ] * pVec->factor ) << ' ';
00468             for( j = 0; pVec->words[ j ].wnum; ++j )
00469                 ostm << pVec->words[ j ].wnum << ':' << pVec->words[ j ].weight << ' ';
00470             ostm << '#' << endl; }
00471 //          ostm << '#' << pVec->userdefined << endl; }
00472 
00473     return true; }
00474 
00475 bool CSVMImpl::Evaluate( const SData& sData, const CGenes* pGenesIn, CDat& DatOut ) const {
00476     size_t  i, j, iGenes;
00477     DOC*    pDoc;
00478 
00479     if( !m_pModel )
00480         return false;
00481     if( m_pModel->kernel_parm.kernel_type == 0 )
00482         add_weight_vector_to_linear_model( m_pModel );
00483 
00484     if( sData.m_eType == SData::EFile )
00485         return EvaluateFile( sData.m_uData.m_szFile, DatOut );
00486 
00487     iGenes = ( sData.m_eType == SData::EPCLs ) ? sData.m_uData.m_pPCLs->GetGenes( ) :
00488         sData.m_uData.m_pData->GetGenes( );
00489     for( i = 0; i < iGenes; ++i ) {
00490         const string&   strGeneOne  = ( sData.m_eType == SData::EPCLs ) ?
00491             sData.m_uData.m_pPCLs->GetGene( i ) : sData.m_uData.m_pData->GetGene( i );
00492 
00493         if( !( i % 10 ) )
00494             g_CatSleipnir( ).notice( "CSVMImpl::Evaluate( ) gene %d/%d", i, iGenes );
00495         if( pGenesIn && !pGenesIn->IsGene( strGeneOne ) )
00496             continue;
00497         for( j = ( i + 1 ); j < iGenes; ++j ) {
00498             const string&   strGeneTwo  = ( sData.m_eType == SData::EPCLs ) ?
00499                 sData.m_uData.m_pPCLs->GetGene( j ) : sData.m_uData.m_pData->GetGene( j );
00500 
00501             if( pGenesIn && !pGenesIn->IsGene( strGeneTwo ) )
00502                 continue;
00503             if( !( pDoc = CreateDoc( sData, i, j, 0 ) ) )
00504                 return false;
00505             DatOut.Set( i, j, (float)( m_pModel->kernel_parm.kernel_type ?
00506                 classify_example( m_pModel, pDoc ) :
00507                 classify_example_linear( m_pModel, pDoc ) ) );
00508             free_example( pDoc, 1 ); } }
00509 
00510     return true; }
00511 
00527 bool CSVM::Evaluate( const CPCL& PCL, vector<float>& vecdResults ) const {
00528     size_t  i;
00529     DOC*    pDoc;
00530     SData   sData;
00531 
00532     if( !m_pModel )
00533         return false;
00534     if( m_pModel->kernel_parm.kernel_type == 0 )
00535         add_weight_vector_to_linear_model( m_pModel );
00536 
00537     sData.m_eType = SData::EPCL;
00538     sData.m_uData.m_pPCL = &PCL;
00539     for( i = 0; i < PCL.GetGenes( ); ++i ) {
00540         if( !( i % 1000 ) )
00541             g_CatSleipnir( ).notice( "CSVMImpl::Evaluate( ) gene %d/%d", i, PCL.GetGenes( ) );
00542         if( PCL.IsMasked( i ) )
00543             continue;
00544 
00545         if( !( pDoc = CreateDoc( sData, i ) ) )
00546             return false;
00547         vecdResults.push_back( (float)( m_pModel->kernel_parm.kernel_type ?
00548             classify_example( m_pModel, pDoc ) :
00549             classify_example_linear( m_pModel, pDoc ) ) );
00550         free_example( pDoc, 1 ); }
00551 
00552     return true; }
00553 
00554 bool CSVMImpl::EvaluateFile( const char* szFile, CDat& DatOut ) const {
00555     static const size_t c_iSize = 512;
00556     char            szGene[ c_iSize ];
00557     SVMPerf::WORD*  asWords;
00558     char*           pc;
00559     ifstream        ifsm;
00560     vector<string>  vecstrGenes;
00561     uint32_t        i, j, k, iDocs, iWords, iGenes;
00562     float*          ad;
00563     DOC*            pDoc;
00564 
00565     ifsm.open( szFile, ios_base::binary );
00566     if( !ifsm.is_open( ) )
00567         return false;
00568     ifsm.read( (char*)&iWords, sizeof(iWords) );
00569     ifsm.read( (char*)&iDocs, sizeof(iDocs) );
00570     ifsm.seekg( iDocs * ( ( ( iWords + 1 ) * sizeof(float) ) +
00571         ( 3 * sizeof(iDocs) ) ), ios_base::cur );
00572     ifsm.read( (char*)&iGenes, sizeof(iGenes) );
00573     vecstrGenes.resize( iGenes );
00574     for( i = 0; i < iGenes; ++i ) {
00575         for( pc = szGene; ; ++pc ) {
00576             ifsm.read( pc, 1 );
00577             if( !*pc )
00578                 break; }
00579         vecstrGenes[ i ] = szGene; }
00580     DatOut.Open( vecstrGenes );
00581 
00582     asWords = ( iWords >= c_iWords ) ? new SVMPerf::WORD[ iWords + 1 ] : s_asWords;
00583     for( i = 0; i < iWords; ++i )
00584         asWords[ i ].wnum = i + 1;
00585     asWords[ i ].wnum = 0;
00586 
00587     ad = new float[ iWords + 1 ];
00588     ifsm.seekg( 2 * sizeof(iDocs), ios_base::beg );
00589     for( i = 0; i < iDocs; ++i ) {
00590         if( !( i % 1000 ) )
00591             g_CatSleipnir( ).notice( "CSVMImpl::EvaluateFile( %s ) pair %d/%d", szFile, i,
00592                 iDocs );
00593         ifsm.read( (char*)ad, ( iWords + 1 ) * sizeof(*ad) );
00594         for( j = 0; j < iWords; ++j )
00595             asWords[ j ].weight = ad[ j + 1 ];
00596         pDoc = create_example( i, 0, 0, 1, create_svector( asWords, "", 1 ) );
00597         ifsm.read( (char*)&j, sizeof(j) );
00598         ifsm.read( (char*)&j, sizeof(j) );
00599         ifsm.read( (char*)&k, sizeof(k) );
00600         DatOut.Set( j, k, (float)( m_pModel->kernel_parm.kernel_type ?
00601             classify_example( m_pModel, pDoc ) :
00602             classify_example_linear( m_pModel, pDoc ) ) );
00603         free_example( pDoc, 1 ); }
00604     delete[] ad;
00605 
00606     if( asWords != s_asWords )
00607         delete[] asWords;
00608 
00609     return true; }
00610 
00627 bool CSVM::Open( std::istream& istm ) {
00628     static const size_t c_iBuf  = 131072;
00629     char            szBuf[ c_iBuf ];
00630     vector<string>  vecstrLine, vecstrToken;
00631     SVMPerf::WORD*  asWords;
00632     size_t          i, j;
00633 
00634     Reset( false, true, true );
00635     m_pModel = (MODEL*)calloc( 1, sizeof(*m_pModel) );
00636 
00637     istm.getline( szBuf, c_iBuf - 1 );
00638     istm >> m_pModel->kernel_parm.kernel_type;
00639     istm.getline( szBuf, c_iBuf - 1 );
00640     istm >> m_pModel->kernel_parm.poly_degree;
00641     istm.getline( szBuf, c_iBuf - 1 );
00642     istm >> m_pModel->kernel_parm.rbf_gamma;
00643     istm.getline( szBuf, c_iBuf - 1 );
00644     istm >> m_pModel->kernel_parm.coef_lin;
00645     istm.getline( szBuf, c_iBuf - 1 );
00646     istm >> m_pModel->kernel_parm.coef_const;
00647     istm.getline( szBuf, c_iBuf - 1 );
00648     istm.getline( szBuf, c_iBuf - 1 );
00649     CMeta::Tokenize( szBuf, vecstrLine, "#", true );
00650     if( vecstrLine.size( ) > 1 )
00651         strcpy_s( m_pModel->kernel_parm.custom, 49, vecstrLine[ 0 ].c_str( ) );
00652     istm >> m_pModel->totwords;
00653     istm.getline( szBuf, c_iBuf - 1 );
00654     istm >> m_pModel->totdoc;
00655     istm.getline( szBuf, c_iBuf - 1 );
00656     istm >> m_pModel->sv_num;
00657     istm.getline( szBuf, c_iBuf - 1 );
00658     istm >> m_pModel->b;
00659     istm.getline( szBuf, c_iBuf - 1 );
00660 
00661     m_pModel->supvec = (DOC**)malloc( m_pModel->sv_num * sizeof(*m_pModel->supvec) );
00662     m_pModel->alpha = (double*)malloc( m_pModel->sv_num * sizeof(*m_pModel->alpha) );
00663     m_pModel->index = NULL;
00664     m_pModel->lin_weights = NULL;
00665 
00666     asWords = new SVMPerf::WORD[ m_pModel->totwords + 1 ];
00667     asWords[ m_pModel->totwords ].wnum = 0;
00668     for( i = 1; i < (size_t)m_pModel->sv_num; ++i ) {
00669         istm.getline( szBuf, c_iBuf - 1 );
00670         szBuf[ c_iBuf - 1 ] = 0;
00671         vecstrLine.clear( );
00672         CMeta::Tokenize( szBuf, vecstrLine, CMeta::c_szWS, true );
00673         if( vecstrLine.size( ) != ( m_pModel->totwords + 2 ) ) {
00674             g_CatSleipnir( ).error( "CSVM::Open( ) wanted %d words but only found %d on line: %s",
00675                 ( m_pModel->totwords + 2 ), vecstrLine.size( ), szBuf );
00676             delete[] asWords;
00677             return false; }
00678         m_pModel->alpha[ i ] = atof( vecstrLine[ 0 ].c_str( ) );
00679         for( j = 1; ( j + 1 ) < vecstrLine.size( ); ++j ) {
00680             vecstrToken.clear( );
00681             CMeta::Tokenize( vecstrLine[ j ].c_str( ), vecstrToken, ":", true );
00682             if( vecstrToken.size( ) != 2 ) {
00683                 g_CatSleipnir( ).error( "CSVM::Open( ) found illegal token \"%s\" on line: %s",
00684                     vecstrLine[ j ].c_str( ), szBuf );
00685                 delete[] asWords;
00686                 return false; }
00687             asWords[ j - 1 ].wnum = atoi( vecstrToken[ 0 ].c_str( ) );
00688             asWords[ j - 1 ].weight = (float)atof( vecstrToken[ 1 ].c_str( ) ); }
00689         m_pModel->supvec[ i ] = create_example( -1, 0, 0, 0, create_svector( asWords, "",
00690             1 ) ); }
00691 
00692     delete[] asWords;
00693     return true; }
00694 
00695 }
00696 
00697 #endif // NO_SVM_PERF