Sleipnir
|
/*****************************************************************************
* This file is provided under the Creative Commons Attribution 3.0 license.
*
* You are free to share, copy, distribute, transmit, or adapt this work
* PROVIDED THAT you attribute the work to the authors listed below.
* For more information, please see the following web page:
* http://creativecommons.org/licenses/by/3.0/
*
* This file is a component of the Sleipnir library for functional genomics,
* authored by:
*   Curtis Huttenhower (chuttenh@princeton.edu)
*   Mark Schroeder
*   Maria D. Chikina
*   Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
*
* If you use this library, the included executable tools, or any related
* code in your work, please cite the following publication:
*   Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
*   Olga G. Troyanskaya.
*   "The Sleipnir library for computational functional genomics"
*****************************************************************************/
#include "stdafx.h"
#include "svm.h"
#include "pclset.h"
#include "dataset.h"
#include "meta.h"
#include "genome.h"

#ifndef NO_SVM_PERF

// C entry points exported by the bundled SVM-light/SVMperf library; the
// DOC/MODEL/KERNEL_CACHE/LEARN_PARM/KERNEL_PARM types come from svm.h.
extern "C" {
KERNEL_CACHE* kernel_cache_init( long, long );
void kernel_cache_cleanup( KERNEL_CACHE* );
void svm_learn_classification( DOC**, double*, long, long, LEARN_PARM*, KERNEL_PARM*,
	KERNEL_CACHE*, MODEL*, double* );
void svm_learn_regression( DOC**, double*, long, long, LEARN_PARM*, KERNEL_PARM*,
	KERNEL_CACHE**, MODEL* );
void svm_learn_ranking( DOC**, double*, long, long, LEARN_PARM*, KERNEL_PARM*,
	KERNEL_CACHE**, MODEL* );
void svm_learn_optimization( DOC**, double*, long, long, LEARN_PARM*, KERNEL_PARM*,
	KERNEL_CACHE*, MODEL*, double* );
}

namespace Sleipnir {

// Reads a binary example file into SVM-light DOC structures.
//
// Binary layout implied by the reads below:
//   uint32 word (feature) count, uint32 document count, then per document:
//   float label, <word count> float feature values, uint32 comment length,
//   <comment length> comment bytes (no terminating NUL in the file).
//
// Outputs: *papDocs (array of *piDocs DOC pointers), *padLabels (matching
// labels), *piWords, *piDocs.  Returns false only if the file can't be opened.
//
// NOTE(review): all fread( ) return values are unchecked, so a truncated or
// corrupt file silently yields garbage counts/values.  The comment length `i`
// is also never bounded against sizeof(szComment) (1024), so a malformed
// record can overflow szComment — TODO confirm inputs are always trusted.
// NOTE(review): *papDocs/*padLabels are allocated with my_malloc( ) here, but
// CSVMImpl::Reset( ) releases m_apDocs/m_adLabels (which Initialize( ) points
// at these arrays in the EFile path) with delete[] — allocator mismatch.
bool
read_documents_bin( char* szFile, DOC*** papDocs, double** padLabels,
	uint32_t* piWords, uint32_t* piDocs ) {
	char szComment[ 1024 ];
	FILE* pfileDoc;
	SVMPerf::WORD* aWords;
	uint32_t i, iDoc, iWord;
	float d;
	float* ad;

	g_CatSleipnir( ).info( "CSVM::read_documents_bin( ) Reading binary examples into memory..." );

#pragma warning( disable : 4996 )
	if( !( pfileDoc = fopen( szFile, "rb" ) ) ) {
#pragma warning( default : 4996 )
		g_CatSleipnir( ).error( "CSVM::read_documents_bin( ) Could not open: %s", szFile );
		return false; }

	// Header: feature count, then document count.
	fread( piWords, sizeof(*piWords), 1, pfileDoc );
	fread( piDocs, sizeof(*piDocs), 1, pfileDoc );

	(*papDocs) = (DOC**)my_malloc( sizeof(DOC*) * (*piDocs) );
	(*padLabels) = (double*)my_malloc( sizeof(double) * (*piDocs) );
	ad = (float*)my_malloc( sizeof(*ad) * (*piWords) );
	// SVM-light WORD arrays are terminated by a sentinel with wnum == 0;
	// feature numbers are 1-based.
	aWords = (SVMPerf::WORD*)my_malloc( sizeof(SVMPerf::WORD) * ( (*piWords) + 1 ) );
	for( iWord = 0; iWord < (*piWords); ++iWord )
		aWords[ iWord ].wnum = iWord + 1;
	aWords[ iWord ].wnum = 0;
	for( iDoc = 0; iDoc < (*piDocs); ++iDoc ) {
		if( !( iDoc % 100000 ) )
			g_CatSleipnir( ).info( "CSVM::read_documents_bin( ) Read %d/%d", iDoc, *piDocs );
		// Label, then the dense feature vector for this document.
		fread( &d, sizeof(d), 1, pfileDoc );
		(*padLabels)[ iDoc ] = d;
		fread( ad, sizeof(*ad), (*piWords), pfileDoc );
		for( iWord = 0; iWord < (*piWords); ++iWord )
			aWords[ iWord ].weight = ad[ iWord ];
		// Length-prefixed comment string; NUL-terminate it ourselves.
		fread( &i, sizeof(i), 1, pfileDoc );
		if( i )
			fread( szComment, sizeof(*szComment), i, pfileDoc );
		szComment[ i ] = 0;
		// create_svector's final argument 1 tells SVM-light to copy aWords,
		// so the single aWords buffer can be reused for every document.
		(*papDocs)[ iDoc ] = create_example( iDoc, 0, 0, 1, create_svector( aWords, szComment, 1 ) ); }
	free( aWords );
	free( ad );

	fclose( pfileDoc );
	return true;
}

// Shared scratch WORD buffer used by CreateDoc/EvaluateFile when the feature
// count fits; avoids a heap allocation per document.
// NOTE(review): a single static buffer used without any locking — not safe
// for concurrent CreateDoc calls on multiple threads.
SVMPerf::WORD CSVMImpl::s_asWords[ CSVMImpl::c_iWords ];

// Default SVM-light learning parameters (LEARN_PARM); mirrors the defaults
// SVM-light's own command-line tool sets up.
CSVMImpl::SLearn::SLearn( ) {

	predfile[ 0 ] = 0;
	alphafile[ 0 ] = 0;
	biased_hyperplane = 1;
	sharedslack = 0;
	remove_inconsistent = 0;
	skip_final_opt_check = 0;
	svm_maxqpsize = 10;
	svm_newvarsinqp = 0;
	svm_iter_to_shrink = -1;	// -1 = "unset"; Learn( ) picks 2 (linear) or 100
	maxiter = 100000;
	kernel_cache_size = 40;
	svm_c = 0;
	eps = 0.1;
	transduction_posratio = -1.0;
	svm_costratio = 0;			// 0 = "unset"; Learn( ) derives it from label balance
	svm_costratio_unlab = 1;
	svm_unlabbound = 1e-5;
	epsilon_crit = 0.001;
	epsilon_a = 1e-15;
	compute_loo = 0;
	rho = 1;
	xa_depth = 0;
	type = CLASSIFICATION; }

// Default SVM-light kernel parameters (KERNEL_PARM); kernel_type 0 is linear.
CSVMImpl::SKernel::SKernel( ) {

	kernel_type = 0;
	poly_degree = 3;
	rbf_gamma = 1;
	coef_lin = 1;
	coef_const = 1;
	custom[ 0 ] = 0; }

CSVMImpl::CSVMImpl( ) : m_apDocs(NULL), m_iDocs(0), m_adAlphas(NULL), m_iAlphas(0),
	m_pModel(NULL), m_adLabels(NULL) {

	// SVM-light global logging verbosity.
	verbosity = 2; }

CSVMImpl::~CSVMImpl( ) {

	Reset( true, true, true ); }

// Releases any combination of owned state: training documents/labels (fData),
// the learned model (fModel), and the initial-alpha array (fAlphas).
// NOTE(review): m_apDocs/m_adLabels are delete[]'d and m_adAlphas is free( )'d
// here, but the EFile path of Initialize( ) fills the former via my_malloc( )
// and OpenAlphas( ) fills the latter via new[] — allocator mismatches either
// way; TODO confirm my_malloc/free pairing against the SVM-light sources.
void CSVMImpl::Reset( bool fData, bool fModel, bool fAlphas ) {
	size_t i;

	if( fModel && m_pModel ) {
		free_model( m_pModel, 0 );
		m_pModel = NULL; }
	if( fAlphas && m_adAlphas ) {
		free( m_adAlphas );
		m_adAlphas = NULL; }
	if( fData ) {
		if( m_apDocs ) {
			for( i = 0; i < m_iDocs; ++i )
				free_example( m_apDocs[ i ], 1 );
			delete[] m_apDocs;
			m_apDocs = NULL; }
		if( m_adLabels ) {
			delete[] m_adLabels;
			m_adLabels = NULL; } } }

// Number of SVM features for the given data source: a PCL set contributes
// two blocks of experiments (one per gene of a pair, hence * 2), a dataset
// or single PCL contributes one value per experiment, and a binary file uses
// the word count recorded when it was read.
// NOTE(review): returns -1 for an unhandled type, which wraps to SIZE_MAX in
// the size_t return — callers do not appear to check for it.
size_t CSVMImpl::GetWords( const SData& sData ) const {
	size_t i, iRet;

	switch( sData.m_eType ) {
		case SData::EPCLs:
			for( iRet = i = 0; i < sData.m_uData.m_pPCLs->GetPCLs( ); ++i )
				iRet += sData.m_uData.m_pPCLs->Get( i ).GetExperiments( );
			return ( iRet * 2 );

		case SData::EData:
			return sData.m_uData.m_pData->GetExperiments( );

		case SData::EFile:
			return m_iWords;

		case SData::EPCL:
			return sData.m_uData.m_pPCL->GetExperiments( ); }

	return -1; }

// Builds one SVM-light DOC for the gene pair (iOne, iTwo) with document id
// iDoc.  For a PCL set, the first half of the feature vector holds gene one's
// expression values and the second half gene two's (missing values become 0);
// for a dataset, each feature is the pair's continuous value in one
// experiment.  Uses the static s_asWords buffer when it is large enough.
DOC* CSVMImpl::CreateDoc( const SData& sData, size_t iOne, size_t iTwo, size_t iDoc ) const {
	SVMPerf::WORD* asWords;
	size_t i, j, iWord, iWords;
	float d;
	DOC* pRet;

	iWords = GetWords( sData );
	// Need iWords + 1 entries (trailing wnum == 0 sentinel), hence >=.
	asWords = ( iWords >= c_iWords ) ? new SVMPerf::WORD[ iWords + 1 ] : s_asWords;
	for( i = 0; i < iWords; ++i )
		asWords[ i ].wnum = i + 1;
	asWords[ i ].wnum = 0;
	if( sData.m_eType == SData::EPCLs ) {
		const CPCLSet& PCLs = *sData.m_uData.m_pPCLs;

		for( iWord = i = 0; i < PCLs.GetPCLs( ); ++i ) {
			for( j = 0; j < PCLs.Get( i ).GetExperiments( ); ++j ) {
				if( CMeta::IsNaN( d = PCLs.Get( i, iOne, j ) ) )
					d = 0;
				assert( ( iWord + j ) < iWords );
				asWords[ iWord + j ].weight = d;
				if( CMeta::IsNaN( d = PCLs.Get( i, iTwo, j ) ) )
					d = 0;
				assert( ( iWord + ( iWords / 2 ) + j ) < iWords );
				asWords[ iWord + ( iWords / 2 ) + j ].weight = d; }
			iWord += PCLs.Get( i ).GetExperiments( ); } }
	else {
		const IDataset* pData = sData.m_uData.m_pData;

		for( i = 0; i < pData->GetExperiments( ); ++i ) {
			if( CMeta::IsNaN( d = pData->GetContinuous( iOne, iTwo, i ) ) )
				d = 0;
			asWords[ i ].weight = d; } }

	// SVM-light copies the vector (last argument 1), so scratch reuse is safe.
	pRet = create_example( iDoc, 0, 0, 1, create_svector( asWords, "", 1 ) );
	if( asWords != s_asWords )
		delete[] asWords;
	return pRet; }

// Builds one SVM-light DOC for a single gene of a PCL (EPCL data): one
// feature per experiment, NaN mapped to 0.
// NOTE(review): the document id passed to create_example is `i`, which equals
// iWords after the loop — unlike the pair overload, which passes a caller-
// supplied iDoc.  Presumably the id is unused downstream; TODO confirm.
DOC* CSVMImpl::CreateDoc( const SData& sData, size_t iGene ) const {
	SVMPerf::WORD* asWords;
	size_t i, iWords;
	DOC* pRet;
	float d;

	iWords = GetWords( sData );
	asWords = ( iWords >= c_iWords ) ? new SVMPerf::WORD[ iWords + 1 ] : s_asWords;
	for( i = 0; i < iWords; ++i )
		asWords[ i ].wnum = i + 1;
	asWords[ i ].wnum = 0;

	for( i = 0; i < iWords; ++i )
		asWords[ i ].weight = CMeta::IsNaN( d = sData.m_uData.m_pPCL->Get( iGene, i ) ) ? 0 : d;
	pRet = create_example( i, 0, 0, 1, create_svector( asWords, "", 1 ) );

	if( asWords != s_asWords )
		delete[] asWords;
	return pRet; }

// Loads initial alpha values, one per line, for use by Learn( ).
// NOTE(review): m_adAlphas is new[]'d here but Reset( ) releases it with
// free( ) — allocator mismatch.
bool CSVM::OpenAlphas( std::istream& istm ) {
	static const size_t c_iBuf = 1024;
	char szBuf[ c_iBuf ];
	vector<float> vecdAlphas;
	size_t i;

	Reset( false, false, true );
	while( istm.peek( ) != EOF ) {
		istm.getline( szBuf, c_iBuf - 1 );
		vecdAlphas.push_back( (float)atof( szBuf ) ); }
	m_adAlphas = new double[ m_iAlphas = vecdAlphas.size( ) ];
	for( i = 0; i < m_iAlphas; ++i )
		m_adAlphas[ i ] = vecdAlphas[ i ];

	return true; }

// Populates m_apDocs/m_adLabels/m_iDocs from the given data source:
//  - EFile: delegate to read_documents_bin.
//  - EPCL:  one document per unmasked gene that is labeled positive (in the
//           answer gene set, label +1) or negative (in m_pNegative, label -1);
//           with no negative set, every unmasked gene gets +1/-1 by
//           positive-set membership.
//  - EPCLs/EData: one document per gene pair with a non-missing answer value;
//           label is +1 for a nonzero answer, -1 for zero.
bool CSVMImpl::Initialize( const SData& sData ) {
	size_t i, j, iOne, iTwo, iDoc;
	vector<size_t> veciGenes;
	float d;

	Reset( true, false, false );
	if( sData.m_eType == SData::EFile ) {
		read_documents_bin( (char*)sData.m_uData.m_szFile, &m_apDocs, &m_adLabels,
			&m_iWords, &m_iDocs );
		return true; }
	if( sData.m_eType == SData::EPCL ) {
		// First pass: count qualifying genes so the arrays can be sized.
		for( m_iDocs = i = 0; i < sData.m_uData.m_pPCL->GetGenes( ); ++i )
			if( !sData.m_uData.m_pPCL->IsMasked( i ) && ( !sData.m_pNegative ||
				sData.m_uAnswers.m_pGenes->IsGene( sData.m_uData.m_pPCL->GetGene( i ) ) ||
				sData.m_pNegative->IsGene( sData.m_uData.m_pPCL->GetGene( i ) ) ) )
				m_iDocs++;
		m_apDocs = new DOC*[ m_iDocs ];
		m_adLabels = new double[ m_iDocs ];
		// Second pass: build a document and label for each qualifying gene.
		for( i = j = 0; i < sData.m_uData.m_pPCL->GetGenes( ); ++i )
			if( !sData.m_uData.m_pPCL->IsMasked( i ) ) {
				const string& strGene = sData.m_uData.m_pPCL->GetGene( i );

				d = 0;
				if( !sData.m_pNegative )
					d = sData.m_uAnswers.m_pGenes->IsGene( strGene ) ? 1.0f : -1.0f;
				else if( sData.m_uAnswers.m_pGenes->IsGene( strGene ) )
					d = 1;
				else if( sData.m_pNegative->IsGene( strGene ) )
					d = -1;
				if( d ) {
					m_apDocs[ j ] = CreateDoc( sData, i );
					m_adLabels[ j++ ] = d; } }
		return true; }

	// Pairwise case: map each data gene to its index in the answer set
	// (-1 when absent), then enumerate pairs with a non-missing answer.
	veciGenes.resize( ( sData.m_eType == SData::EPCLs ) ?
		sData.m_uData.m_pPCLs->GetGenes( ) : sData.m_uData.m_pData->GetGenes( ) );
	for( i = 0; i < veciGenes.size( ); ++i )
		veciGenes[ i ] = sData.m_uAnswers.m_pAnswers->GetGene( ( sData.m_eType == SData::EPCLs ) ?
			sData.m_uData.m_pPCLs->GetGene( i ) : sData.m_uData.m_pData->GetGene( i ) );
	for( m_iDocs = i = 0; i < veciGenes.size( ); ++i )
		if( ( iOne = veciGenes[ i ] ) != -1 )
			for( j = ( i + 1 ); j < veciGenes.size( ); ++j )
				if( ( ( iTwo = veciGenes[ j ] ) != -1 ) &&
					!CMeta::IsNaN( sData.m_uAnswers.m_pAnswers->Get( iOne, iTwo ) ) )
					m_iDocs++;
	m_apDocs = new DOC*[ m_iDocs ];
	m_adLabels = new double[ m_iDocs ];

	for( iDoc = i = 0; i < veciGenes.size( ); ++i )
		if( ( iOne = veciGenes[ i ] ) != -1 )
			for( j = ( i + 1 ); j < veciGenes.size( ); ++j )
				if( ( ( iTwo = veciGenes[ j ] ) != -1 ) &&
					!CMeta::IsNaN( d = sData.m_uAnswers.m_pAnswers->Get( iOne, iTwo ) ) ) {
					m_adLabels[ iDoc ] = d ? 1 : -1;
					// NOTE(review): iDoc++ and the iDoc argument below are
					// unsequenced relative to each other before C++17, so the
					// document id passed to CreateDoc is unspecified (old vs.
					// new value) — undefined behavior in older dialects.
					m_apDocs[ iDoc++ ] = CreateDoc( sData, i, j, iDoc ); }
	assert( iDoc == m_iDocs );

	return true; }

// Convenience overload: learn with an empty negative gene set, i.e. every
// unmasked non-positive gene is treated as negative.
bool CSVM::Learn( const CPCL& PCL, const CGenes& GenesPositive ) {
	CGenes GenesNeg( GenesPositive.GetGenome( ) );

	return Learn( PCL, GenesPositive, GenesNeg ); }

// Learns from a PCL with explicit positive and negative gene sets; an empty
// negative set degenerates to the "everything else is negative" behavior.
bool CSVM::Learn( const CPCL& PCL, const CGenes& GenesPositive, const CGenes& GenesNegative ) {
	SData sData;

	sData.m_eType = SData::EPCL;
	sData.m_uData.m_pPCL = &PCL;
	sData.m_uAnswers.m_pGenes = &GenesPositive;
	sData.m_pNegative = GenesNegative.GetGenes( ) ? &GenesNegative : NULL;

	return CSVMImpl::Learn( sData ); }

// Core training driver: builds documents via Initialize( ), fills in the
// "unset" learning parameters, and dispatches to the appropriate SVM-light
// learner.  The kernel cache is only used for nonlinear kernels.
bool CSVMImpl::Learn( const SData& sData ) {
	KERNEL_CACHE* pCache;
	size_t i, iNeg, iPos, iWords;

	Reset( false, true, false );
	// SVM-light's free_model( ) owns this, hence C calloc rather than new.
	m_pModel = (MODEL*)calloc( 1, sizeof(*m_pModel) );
	if( !Initialize( sData ) )
		return false;
	if( !m_sLearn.svm_costratio ) {
		// Default cost ratio balances the classes: #negatives / #positives.
		// NOTE(review): iPos == 0 (no positive examples) makes this divide by
		// zero — yields +inf after float conversion; presumably never happens
		// with sane inputs, but worth a guard.
		for( iNeg = iPos = i = 0; i < m_iDocs; ++i )
			if( m_adLabels[ i ] == 1 )
				iPos++;
			else
				iNeg++;
		m_sLearn.svm_costratio = (float)iNeg / iPos; }
	if( m_sLearn.svm_iter_to_shrink < 0 )
		m_sLearn.svm_iter_to_shrink = ( m_sKernel.kernel_type == LINEAR ) ? 2 : 100;
	iWords = GetWords( sData );

	pCache = ( m_sKernel.kernel_type == LINEAR ) ? NULL :
		kernel_cache_init( m_iDocs, m_sLearn.kernel_cache_size );
	switch( m_sLearn.type ) {
		case CLASSIFICATION:
			svm_learn_classification( m_apDocs, m_adLabels, m_iDocs, iWords,
				(LEARN_PARM*)&m_sLearn, (KERNEL_PARM*)&m_sKernel, pCache, m_pModel,
				m_adAlphas );
			break;

		case REGRESSION:
			svm_learn_regression( m_apDocs, m_adLabels, m_iDocs, iWords,
				(LEARN_PARM*)&m_sLearn, (KERNEL_PARM*)&m_sKernel, &pCache, m_pModel );
			break;

		case RANKING:
			svm_learn_ranking( m_apDocs, m_adLabels, m_iDocs, iWords, (LEARN_PARM*)&m_sLearn,
				(KERNEL_PARM*)&m_sKernel, &pCache, m_pModel );
			break;

		case OPTIMIZATION:
			svm_learn_optimization( m_apDocs, m_adLabels, m_iDocs, iWords,
				(LEARN_PARM*)&m_sLearn, (KERNEL_PARM*)&m_sKernel, pCache, m_pModel,
				m_adAlphas );
			break; }

	if( pCache )
		kernel_cache_cleanup( pCache );

	return true; }

// Serializes the learned model in SVM-light's text model format: a version
// line, the kernel parameters, feature/document counts, the support-vector
// count and threshold b, then one line per support vector starting with
// alpha*y followed by wnum:weight pairs.  Open( ) below reads this back.
bool CSVM::Save( std::ostream& ostm ) const {
	size_t i, j;
	SVECTOR* pVec;

	if( !m_pModel )
		return false;

	ostm << "SVM-light Version " << VERSION << endl;
	ostm << m_pModel->kernel_parm.kernel_type << " # kernel type" << endl;
	ostm << m_pModel->kernel_parm.poly_degree << " # kernel parameter -d" << endl;
	ostm << m_pModel->kernel_parm.rbf_gamma << " # kernel parameter -g" << endl;
	ostm << m_pModel->kernel_parm.coef_lin << " # kernel parameter -s" << endl;
	ostm << m_pModel->kernel_parm.coef_const << " # kernel parameter -r" << endl;
	ostm << m_pModel->kernel_parm.custom << "# kernel parameter -u" << endl;
	ostm << m_pModel->totwords << " # highest feature index" << endl;
	ostm << m_pModel->totdoc << " # number of training documents" << endl;

	// Count every SVECTOR in every support vector's chain; SVM-light indexes
	// support vectors from 1, hence the i = j = 1 start.
	for( i = j = 1; i < (size_t)m_pModel->sv_num; ++i )
		for( pVec = m_pModel->supvec[ i ]->fvec; pVec; pVec = pVec->next )
			j++;
	ostm << j << " # number of support vectors plus 1" << endl;
	ostm << m_pModel->b <<
		" # threshold b, each following line is a SV (starting with alpha*y)" << endl;

	for( i = 1; i < (size_t)m_pModel->sv_num; ++i )
		for( pVec = m_pModel->supvec[ i ]->fvec; pVec; pVec = pVec->next ) {
			ostm << ( m_pModel->alpha[ i ] * pVec->factor ) << ' ';
			for( j = 0; pVec->words[ j ].wnum; ++j )
				ostm << pVec->words[ j ].wnum << ':' << pVec->words[ j ].weight << ' ';
			ostm << '#' << endl; }
//			ostm << '#' << pVec->userdefined << endl; }

	return true; }

// Scores every gene pair (optionally restricted to pGenesIn) with the learned
// model and stores the margins in DatOut.  A linear model (kernel_type 0) is
// first converted to an explicit weight vector so the faster
// classify_example_linear path can be used.
bool CSVMImpl::Evaluate( const SData& sData, const CGenes* pGenesIn, CDat& DatOut ) const {
	size_t i, j, iGenes;
	DOC* pDoc;

	if( !m_pModel )
		return false;
	if( m_pModel->kernel_parm.kernel_type == 0 )
		add_weight_vector_to_linear_model( m_pModel );

	if( sData.m_eType == SData::EFile )
		return EvaluateFile( sData.m_uData.m_szFile, DatOut );

	iGenes = ( sData.m_eType == SData::EPCLs ) ? sData.m_uData.m_pPCLs->GetGenes( ) :
		sData.m_uData.m_pData->GetGenes( );
	for( i = 0; i < iGenes; ++i ) {
		const string& strGeneOne = ( sData.m_eType == SData::EPCLs ) ?
			sData.m_uData.m_pPCLs->GetGene( i ) : sData.m_uData.m_pData->GetGene( i );

		if( !( i % 10 ) )
			g_CatSleipnir( ).notice( "CSVMImpl::Evaluate( ) gene %d/%d", i, iGenes );
		if( pGenesIn && !pGenesIn->IsGene( strGeneOne ) )
			continue;
		for( j = ( i + 1 ); j < iGenes; ++j ) {
			const string& strGeneTwo = ( sData.m_eType == SData::EPCLs ) ?
				sData.m_uData.m_pPCLs->GetGene( j ) : sData.m_uData.m_pData->GetGene( j );

			if( pGenesIn && !pGenesIn->IsGene( strGeneTwo ) )
				continue;
			if( !( pDoc = CreateDoc( sData, i, j, 0 ) ) )
				return false;
			DatOut.Set( i, j, (float)( m_pModel->kernel_parm.kernel_type ?
				classify_example( m_pModel, pDoc ) :
				classify_example_linear( m_pModel, pDoc ) ) );
			free_example( pDoc, 1 ); } }

	return true; }

// Scores every unmasked gene of a PCL, appending one margin per gene to
// vecdResults (masked genes produce no entry, so indices are not aligned with
// PCL gene indices when anything is masked).
bool CSVM::Evaluate( const CPCL& PCL, vector<float>& vecdResults ) const {
	size_t i;
	DOC* pDoc;
	SData sData;

	if( !m_pModel )
		return false;
	if( m_pModel->kernel_parm.kernel_type == 0 )
		add_weight_vector_to_linear_model( m_pModel );

	sData.m_eType = SData::EPCL;
	sData.m_uData.m_pPCL = &PCL;
	for( i = 0; i < PCL.GetGenes( ); ++i ) {
		if( !( i % 1000 ) )
			g_CatSleipnir( ).notice( "CSVMImpl::Evaluate( ) gene %d/%d", i, PCL.GetGenes( ) );
		if( PCL.IsMasked( i ) )
			continue;

		if( !( pDoc = CreateDoc( sData, i ) ) )
			return false;
		vecdResults.push_back( (float)( m_pModel->kernel_parm.kernel_type ?
			classify_example( m_pModel, pDoc ) :
			classify_example_linear( m_pModel, pDoc ) ) );
		free_example( pDoc, 1 ); }

	return true; }

// Scores a binary example file (same layout as read_documents_bin, plus a
// trailer of NUL-terminated gene names and per-document gene index pairs)
// directly into DatOut.  The first seekg skips the document section — per
// document (iWords + 1) floats (label + features) and 3 uint32s — to reach
// the gene-name trailer; the second rewinds to just past the header.
// NOTE(review): ifstream reads and the j/k indices from the file are not
// validated against DatOut's size — assumes a well-formed, trusted file.
bool CSVMImpl::EvaluateFile( const char* szFile, CDat& DatOut ) const {
	static const size_t c_iSize = 512;
	char szGene[ c_iSize ];
	SVMPerf::WORD* asWords;
	char* pc;
	ifstream ifsm;
	vector<string> vecstrGenes;
	uint32_t i, j, k, iDocs, iWords, iGenes;
	float* ad;
	DOC* pDoc;

	ifsm.open( szFile, ios_base::binary );
	if( !ifsm.is_open( ) )
		return false;
	ifsm.read( (char*)&iWords, sizeof(iWords) );
	ifsm.read( (char*)&iDocs, sizeof(iDocs) );
	ifsm.seekg( iDocs * ( ( ( iWords + 1 ) * sizeof(float) ) +
		( 3 * sizeof(iDocs) ) ), ios_base::cur );
	ifsm.read( (char*)&iGenes, sizeof(iGenes) );
	vecstrGenes.resize( iGenes );
	for( i = 0; i < iGenes; ++i ) {
		// Gene names are NUL-terminated strings read byte-by-byte.
		for( pc = szGene; ; ++pc ) {
			ifsm.read( pc, 1 );
			if( !*pc )
				break; }
		vecstrGenes[ i ] = szGene; }
	DatOut.Open( vecstrGenes );

	asWords = ( iWords >= c_iWords ) ? new SVMPerf::WORD[ iWords + 1 ] : s_asWords;
	for( i = 0; i < iWords; ++i )
		asWords[ i ].wnum = i + 1;
	asWords[ i ].wnum = 0;

	ad = new float[ iWords + 1 ];
	// Rewind to the first document (just past the two header uint32s).
	ifsm.seekg( 2 * sizeof(iDocs), ios_base::beg );
	for( i = 0; i < iDocs; ++i ) {
		if( !( i % 1000 ) )
			g_CatSleipnir( ).notice( "CSVMImpl::EvaluateFile( %s ) pair %d/%d", szFile, i,
				iDocs );
		// ad[ 0 ] is the stored label, which is ignored here; features follow.
		ifsm.read( (char*)ad, ( iWords + 1 ) * sizeof(*ad) );
		for( j = 0; j < iWords; ++j )
			asWords[ j ].weight = ad[ j + 1 ];
		pDoc = create_example( i, 0, 0, 1, create_svector( asWords, "", 1 ) );
		// Three uint32s per document: the first is skipped (j is overwritten),
		// the next two are the pair's gene indices in DatOut.
		ifsm.read( (char*)&j, sizeof(j) );
		ifsm.read( (char*)&j, sizeof(j) );
		ifsm.read( (char*)&k, sizeof(k) );
		DatOut.Set( j, k, (float)( m_pModel->kernel_parm.kernel_type ?
			classify_example( m_pModel, pDoc ) :
			classify_example_linear( m_pModel, pDoc ) ) );
		free_example( pDoc, 1 ); }
	delete[] ad;

	if( asWords != s_asWords )
		delete[] asWords;

	return true; }

// Loads a model previously written by Save( ) (SVM-light text format).
// Each getline( ) consumes the remainder of the line after the preceding
// formatted extraction (the " # ..." comments); the support-vector lines are
// then tokenized into alpha*y followed by wnum:weight pairs.
bool CSVM::Open( std::istream& istm ) {
	static const size_t c_iBuf = 131072;
	char szBuf[ c_iBuf ];
	vector<string> vecstrLine, vecstrToken;
	SVMPerf::WORD* asWords;
	size_t i, j;

	Reset( false, true, true );
	// free_model( ) owns this, hence C calloc rather than new.
	m_pModel = (MODEL*)calloc( 1, sizeof(*m_pModel) );

	istm.getline( szBuf, c_iBuf - 1 );	// version line
	istm >> m_pModel->kernel_parm.kernel_type;
	istm.getline( szBuf, c_iBuf - 1 );
	istm >> m_pModel->kernel_parm.poly_degree;
	istm.getline( szBuf, c_iBuf - 1 );
	istm >> m_pModel->kernel_parm.rbf_gamma;
	istm.getline( szBuf, c_iBuf - 1 );
	istm >> m_pModel->kernel_parm.coef_lin;
	istm.getline( szBuf, c_iBuf - 1 );
	istm >> m_pModel->kernel_parm.coef_const;
	istm.getline( szBuf, c_iBuf - 1 );
	// The custom-kernel line holds "<value># kernel parameter -u"; split on
	// '#' and keep the value only if something precedes the comment.
	istm.getline( szBuf, c_iBuf - 1 );
	CMeta::Tokenize( szBuf, vecstrLine, "#", true );
	if( vecstrLine.size( ) > 1 )
		strcpy_s( m_pModel->kernel_parm.custom, 49, vecstrLine[ 0 ].c_str( ) );
	istm >> m_pModel->totwords;
	istm.getline( szBuf, c_iBuf - 1 );
	istm >> m_pModel->totdoc;
	istm.getline( szBuf, c_iBuf - 1 );
	istm >> m_pModel->sv_num;
	istm.getline( szBuf, c_iBuf - 1 );
	istm >> m_pModel->b;
	istm.getline( szBuf, c_iBuf - 1 );

	// SVM-light frees these with free( ), so they must be malloc'd.
	m_pModel->supvec = (DOC**)malloc( m_pModel->sv_num * sizeof(*m_pModel->supvec) );
	m_pModel->alpha = (double*)malloc( m_pModel->sv_num * sizeof(*m_pModel->alpha) );
	m_pModel->index = NULL;
	m_pModel->lin_weights = NULL;

	asWords = new SVMPerf::WORD[ m_pModel->totwords + 1 ];
	asWords[ m_pModel->totwords ].wnum = 0;
	// Support vectors are 1-indexed; slot 0 is unused by SVM-light.
	for( i = 1; i < (size_t)m_pModel->sv_num; ++i ) {
		istm.getline( szBuf, c_iBuf - 1 );
		szBuf[ c_iBuf - 1 ] = 0;
		vecstrLine.clear( );
		CMeta::Tokenize( szBuf, vecstrLine, CMeta::c_szWS, true );
		// Expect alpha*y, totwords wnum:weight pairs, and the trailing '#'.
		if( vecstrLine.size( ) != ( m_pModel->totwords + 2 ) ) {
			g_CatSleipnir( ).error( "CSVM::Open( ) wanted %d words but only found %d on line: %s",
				( m_pModel->totwords + 2 ), vecstrLine.size( ), szBuf );
			delete[] asWords;
			return false; }
		m_pModel->alpha[ i ] = atof( vecstrLine[ 0 ].c_str( ) );
		for( j = 1; ( j + 1 ) < vecstrLine.size( ); ++j ) {
			vecstrToken.clear( );
			CMeta::Tokenize( vecstrLine[ j ].c_str( ), vecstrToken, ":", true );
			if( vecstrToken.size( ) != 2 ) {
				g_CatSleipnir( ).error( "CSVM::Open( ) found illegal token \"%s\" on line: %s",
					vecstrLine[ j ].c_str( ), szBuf );
				delete[] asWords;
				return false; }
			asWords[ j - 1 ].wnum = atoi( vecstrToken[ 0 ].c_str( ) );
			asWords[ j - 1 ].weight = (float)atof( vecstrToken[ 1 ].c_str( ) ); }
		m_pModel->supvec[ i ] = create_example( -1, 0, 0, 0, create_svector( asWords, "",
			1 ) ); }

	delete[] asWords;
	return true; }

}

#endif // NO_SVM_PERF