Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * Changes made Jun 2012 by Qian Zhu 00017 * 00018 * If you use this library, the included executable tools, or any related 00019 * code in your work, please cite the following publication: 00020 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00021 * Olga G. Troyanskaya. 00022 * "The Sleipnir library for computational functional genomics" 00023 *****************************************************************************/ 00024 #include "stdafx.h" 00025 #include "database.h" 00026 #include "meta.h" 00027 #include "bayesnetint.h" 00028 #include "datapair.h" 00029 00030 namespace Sleipnir { 00031 00032 const char CDatabaseImpl::c_acDAB[] = ".dab"; 00033 const char CDatabaseImpl::c_acQDAB[] = ".qdab"; 00034 const char CDatabaseImpl::c_acExtension[] = ".db"; 00035 00037 // CDatabaselet 00039 00040 CDatabaselet::CDatabaselet( bool useNibble) { 00041 m_useNibble = useNibble; 00042 m_pmutx = new pthread_mutex_t( ); 00043 pthread_mutex_init( m_pmutx, NULL ); 00044 } 00045 00046 CDatabaselet::~CDatabaselet( ) { 00047 00048 pthread_mutex_destroy( m_pmutx ); 00049 delete m_pmutx; 00050 if(m_fstm.is_open()){ 00051 m_fstm.close(); 00052 } 00053 } 00054 00055 /* original method for initializing databaselets, including writing header + pre-allocation */ 00056 bool CDatabaselet::Open( const std::string& strFile, const std::vector<std::string>& vecstrGenes, 00057 uint32_t iGenes, uint32_t iDatasets ) { 00058 uint32_t iSize; 00059 size_t i; 00060 char* acFiller; 00061 00062 m_fstm.clear( ); 00063 /* Open with overwriting */ 00064 m_fstm.open( strFile.c_str( ), ios_base::in | ios_base::out | ios_base::binary | ios_base::trunc ); 00065 00066 if( !m_fstm.is_open( ) ) { 00067 g_CatSleipnir( ).error( "CDatabaselet::Open( %s, %u, %u ) open failed", strFile.c_str( ), iGenes, 00068 iDatasets ); 00069 return false; } 00070 00071 00072 m_iGenes = iGenes; 00073 m_iDatasets = iDatasets; 00074 m_vecstrGenes.resize( vecstrGenes.size( ) ); 00075 copy( vecstrGenes.begin( ), vecstrGenes.end( ), m_vecstrGenes.begin( ) ); 00076 00077 //allocate space 00078 m_fstm.write( (char*)&m_iHeader, sizeof(m_iHeader) ); 00079 m_fstm.write( (char*)&m_iGenes, sizeof(m_iGenes) ); 00080 m_fstm.write( (char*)&m_iDatasets, sizeof(m_iDatasets) ); 00081 00082 iSize = m_vecstrGenes.size( ); 00083 00084 m_fstm.write((char*)&iSize, sizeof(iSize)); 00085 00086 //write gene-name for only the genes in the databaselets 00087 m_iHeader = sizeof(m_iHeader) + sizeof(m_iGenes) + sizeof(m_iDatasets) + sizeof(iSize); 00088 for( i = 0; i < m_vecstrGenes.size( ); ++i ) { 00089 m_fstm.write( m_vecstrGenes[ i ].c_str( ), m_vecstrGenes[ i ].size( ) + 1); 00090 m_iHeader += m_vecstrGenes[ i ].size( ) + 1; 00091 } 00092 00093 m_fstm.seekp( 0, ios_base::beg); 00094 m_fstm.write( (char*)&m_iHeader, sizeof(m_iHeader) ); 00095 00096 //pre-allocations 00097 m_fstm.seekp( m_iHeader, ios_base::beg ); 00098 acFiller = new char[ GetSizeGene( ) ]; 00099 memset( acFiller, -1, GetSizeGene( ) ); 00100 for( i = 0; i < m_vecstrGenes.size( ); ++i ){ 00101 m_fstm.write( acFiller, GetSizeGene( ) ); 00102 } 00103 delete[] acFiller; 00104 SetFile(strFile); 00105 00106 return true; 00107 00108 } 00109 00110 bool CDatabaselet::Write(char* Qt, const size_t &iSize, const size_t offset){ 00111 m_fstm.seekg(m_iHeader + offset, ios_base::beg); 00112 m_fstm.write(Qt, iSize); 00113 } 00114 00115 /* simply opens the file without overwriting */ 00116 bool CDatabaselet::OpenNoOverwrite() { 00117 m_fstm.clear( ); 00118 /* open without overwriting */ 00119 m_fstm.open( strFileName.c_str( ), ios_base::in | ios_base::out | ios_base::binary); 00120 00121 if( !m_fstm.is_open( ) ) { 00122 g_CatSleipnir( ).error( "CDatabaselet::Open( %s ) open failed", strFileName.c_str( )); 00123 return false; 00124 } 00125 return true; 00126 } 00127 00128 bool CDatabaselet::OpenWrite( unsigned char bValue, size_t iOffset, ENibbles eNibbles, 00129 unsigned char* abImage ) { 00130 unsigned char b; 00131 00132 if(m_useNibble==0){ 00133 eNibbles = ENibblesBoth; 00134 } 00135 00136 if( abImage ) 00137 iOffset -= m_iHeader; 00138 if( eNibbles != ENibblesBoth ) { 00139 if( abImage ) 00140 b = abImage[ iOffset ]; 00141 else { 00142 m_fstm.seekg( iOffset, ios_base::beg); 00143 b = m_fstm.get( ); 00144 } 00145 } 00146 00147 switch( eNibbles ) { 00148 case ENibblesLow: 00149 b = ( bValue & 0xF ) | ( b & 0xF0 ); 00150 break; 00151 00152 case ENibblesHigh: 00153 b = ( b & 0xF ) | ( bValue << 4 ); 00154 break; 00155 00156 case ENibblesBoth: 00157 b = bValue; 00158 break; 00159 } 00160 if( abImage ) 00161 abImage[ iOffset ] = b; 00162 else { 00163 m_fstm.seekp( iOffset, ios_base::beg ); 00164 m_fstm.put( b ); 00165 } 00166 00167 return true; } 00168 00169 /* original file writing method */ 00170 bool CDatabaselet::Open( const vector<CCompactFullMatrix>& vecData, size_t iBaseGenes, size_t iBaseDatasets, bool fBuffer ) { 00171 unsigned char* abImage; 00172 size_t iSize, iDatum, iGeneOne, iGeneTwo; 00173 unsigned char bOne, bTwo; 00174 00175 if( fBuffer ) { 00176 //iBaseGenes: gene id of first gene in each databaselet 00177 //iDataset: dataset id 00178 //printf("Number: %d %d %d %d\n", GetSizeGene(), GetSizePair(), iBaseGenes, iBaseDatasets); 00179 abImage = new unsigned char[ iSize = ( GetSizeGene( ) * m_vecstrGenes.size( ) ) ]; 00180 m_fstm.seekg( m_iHeader, ios_base::beg ); 00181 m_fstm.read( (char*)abImage, iSize ); 00182 } 00183 else 00184 abImage = NULL; 00185 00186 //vecData: # of genes in databaselet x # of genes user's list 00187 //if this is not the first dataset in the dataset block 00188 if( iBaseDatasets % 2 ){ 00189 //iGeneOne: iterate over all genes in this databaselet (# of genes in each databaselet) 00190 for( iGeneOne = 0; iGeneOne < GetGenes( ); ++iGeneOne ){ 00191 //iGeneTwo: iterate overall genes in user's gene list 00192 for( iGeneTwo = 0; iGeneTwo < vecData[ 0 ].GetColumns( ); ++iGeneTwo ){ 00193 //bOne, get the value of the gene located at the position (iBaseGene + iGeneOne, iGeneTwo) 00194 if( bOne = vecData[ 0 ].Get( iBaseGenes + iGeneOne, iGeneTwo ) ){ 00195 //Offset is: m_iHeader + (GetSizeGene() * iOne) + (GetSizePair() * iTwo) + iDataset (for byte case) 00196 OpenWrite( bOne - 1, GetOffset( iGeneOne, iGeneTwo, iBaseDatasets ), ENibblesHigh, abImage ); 00197 } 00198 } 00199 } 00200 } 00201 00202 for( iDatum = ( iBaseDatasets % 2 ); ( iDatum + 1 ) < vecData.size( ); iDatum += 2 ){ 00203 for( iGeneOne = 0; iGeneOne < GetGenes( ); ++iGeneOne ){ 00204 for( iGeneTwo = 0; iGeneTwo < vecData[ iDatum ].GetColumns( ); ++iGeneTwo ) { 00205 bOne = vecData[ iDatum ].Get( iBaseGenes + iGeneOne, iGeneTwo ); 00206 bTwo = vecData[ iDatum + 1 ].Get( iBaseGenes + iGeneOne, iGeneTwo ); 00207 if( !( bOne || bTwo ) ) 00208 continue; 00209 bOne -= 1; 00210 bTwo -= 1; 00211 if(m_useNibble){ 00212 OpenWrite( ( bOne & 0xF ) | ( bTwo << 4 ), GetOffset( iGeneOne, iGeneTwo, iBaseDatasets + 00213 iDatum ), ENibblesBoth, abImage ); 00214 }else{ 00215 OpenWrite( bOne, GetOffset( iGeneOne, iGeneTwo, iBaseDatasets + iDatum ), ENibblesBoth, 00216 abImage ); 00217 OpenWrite( bTwo, GetOffset( iGeneOne, iGeneTwo, iBaseDatasets + iDatum + 1 ), ENibblesBoth, 00218 abImage ); 00219 } 00220 } 00221 } 00222 } 00223 00224 if( iDatum < vecData.size( ) ){ 00225 for( iGeneOne = 0; iGeneOne < GetGenes( ); ++iGeneOne ){ 00226 for( iGeneTwo = 0; iGeneTwo < vecData[ iDatum ].GetColumns( ); ++iGeneTwo ){ 00227 if( bOne = vecData[ iDatum ].Get( iBaseGenes + iGeneOne, iGeneTwo ) ){ 00228 OpenWrite( bOne - 1, GetOffset( iGeneOne, iGeneTwo, iBaseDatasets + iDatum ), ENibblesLow, 00229 abImage ); 00230 } 00231 } 00232 } 00233 } 00234 if( fBuffer ) { 00235 m_fstm.seekp( m_iHeader, ios_base::beg ); 00236 m_fstm.write( (char*)abImage, iSize ); 00237 delete[] abImage; 00238 } 00239 00240 return true; } 00241 00242 bool CDatabase::GetGene(const string &strGene, vector<unsigned char> &vecbData) const{ 00243 size_t iGene = this->GetGene(strGene); 00244 if(iGene==-1){ 00245 cerr << "Gene not found" << endl; 00246 return false; 00247 } 00248 m_vecpDBs[ iGene % m_vecpDBs.size( ) ]->Get( iGene / m_vecpDBs.size( ), vecbData); 00249 return true; 00250 } 00251 00252 bool CDatabase::GetGene(const size_t &iGene, vector<unsigned char> &vecbData) const{ 00253 m_vecpDBs[ iGene % m_vecpDBs.size( ) ]->Get( iGene / m_vecpDBs.size( ), vecbData); 00254 return true; 00255 } 00256 00257 /* mainly used by SeekMiner */ 00258 bool CDatabaselet::Get(size_t iOne, vector<unsigned char>& vecbData){ 00259 size_t iSize; 00260 size_t i, j; 00261 size_t offset1 = GetOffset(iOne); 00262 if(this->m_fstm.is_open()){ 00263 cerr << "file is already opened" << endl; 00264 }else{ 00265 m_fstm.clear( ); 00266 m_fstm.open( this->strFileName.c_str( ), ios_base::binary | ios_base::in ); 00267 } 00268 00269 this->m_fstm.seekg(offset1, ios_base::beg); 00270 unsigned char *abImage = (unsigned char*)malloc( iSize = GetSizeGene()); 00271 this->m_fstm.read((char*)abImage, iSize); 00272 00273 m_fstm.close(); 00274 00275 if(m_useNibble==false){ 00276 vecbData.clear(); 00277 vecbData.resize(iSize); 00278 for(i=0; i<iSize; i++){ 00279 vecbData[i] = abImage[i]; 00280 } 00281 }else{ 00282 vecbData.clear(); 00283 vecbData.resize(m_iDatasets*m_iGenes); 00284 00285 for(j=0; j<m_iGenes; j++){ 00286 for(i=0; i<GetSizePair(); i++){ 00287 size_t offset = GetOffset(iOne, j) - m_iHeader; 00288 unsigned char b = abImage[offset + i]; 00289 unsigned char bValue = -1; 00290 if( ( bValue = ( b & 0xF ) ) == 0xF ){ 00291 bValue = -1; 00292 } 00293 vecbData[ j * m_iDatasets + 2 * i ] = bValue; 00294 00295 if( ( bValue = ( ( b >> 4 ) & 0xF ) ) == 0xF ){ 00296 bValue = -1; 00297 } 00298 00299 if((2 * i + 1)==m_iDatasets){ 00300 break; 00301 } 00302 vecbData[ j * m_iDatasets + (2 * i) + 1 ] = bValue; 00303 } 00304 } 00305 } 00306 free(abImage); 00307 return true; 00308 } 00309 00310 bool CDatabaselet::Get( size_t iOne, size_t iTwo, 00311 vector<unsigned char>& vecbData, unsigned char *charImage){ 00312 size_t offset = GetOffset(iOne, iTwo) - m_iHeader; 00313 return Get(offset, vecbData, charImage); 00314 } 00315 00316 bool CDatabaselet::Get(size_t offset, vector<unsigned char>& vecbData, unsigned char *charImage){ 00317 size_t i; 00318 00319 if(this->m_useNibble==false){ 00320 vecbData.clear(); 00321 vecbData.resize(GetSizePair()); 00322 00323 for(i=0; i<vecbData.size(); i++){ 00324 vecbData[i] = charImage[offset + i]; 00325 } 00326 }else{ 00327 vecbData.clear(); 00328 vecbData.resize(m_iDatasets); 00329 00330 00331 for(i=0; i<GetSizePair(); i++){ 00332 unsigned char b = charImage[offset + i]; 00333 unsigned char bValue = -1; 00334 if( ( bValue = ( b & 0xF ) ) == 0xF ){ 00335 bValue = -1; 00336 } 00337 vecbData[ 2 * i ] = bValue; 00338 00339 if( ( bValue = ( ( b >> 4 ) & 0xF ) ) == 0xF ){ 00340 bValue = -1; 00341 } 00342 00343 if((2 * i + 1)==m_iDatasets){ 00344 break; 00345 } 00346 vecbData[ (2 * i) + 1 ] = bValue; 00347 } 00348 } 00349 00350 return true; 00351 } 00352 00353 /* static function, combine multiple databaselets (that share the same genes, ie m_vecStrGenes), 00354 and output result to a single file, or output one-gene per file (if databaselet contains multiple genes) 00355 bSplit: whether or not to output one-gene per file 00356 Works for both nibble and byte 00357 */ 00358 bool CDatabaselet::Combine(std::vector<CDatabaselet*>& vecDatabaselet, 00359 std::string strOutDirectory, vector<string> &vecstrGenes, bool bSplit){ 00360 00361 /* for checking on consistency of databaselets */ 00362 bool bIsConsistent = true; 00363 bool fUseNibble; 00364 00365 size_t i, j; 00366 uint32_t iGenes, iDatasets; 00367 00368 CDatabaselet *first = vecDatabaselet[0]; 00369 fUseNibble = first->m_useNibble; 00370 00371 iGenes = first->GetGenes(); 00372 iDatasets = first->GetDatasets(); 00373 00374 vector<string> vecGenes; 00375 vecGenes.resize(iGenes); 00376 00377 for(i=0; i<iGenes; i++){ 00378 vecGenes[i] = first->GetGene(i); 00379 } 00380 00381 for(i=1; bIsConsistent && i<vecDatabaselet.size(); i++){ 00382 if(iGenes!=vecDatabaselet[i]->GetGenes() || fUseNibble!=vecDatabaselet[i]->m_useNibble){ 00383 bIsConsistent = false; 00384 break; 00385 } 00386 for(j=0; j<iGenes; j++){ 00387 if(vecGenes[j]!=vecDatabaselet[i]->GetGene(j)){ 00388 bIsConsistent = false; 00389 break; 00390 } 00391 } 00392 iDatasets+=vecDatabaselet[i]->GetDatasets(); 00393 } 00394 00395 if(!bIsConsistent){ 00396 cerr << "Databaselets are not consistent!" << endl; 00397 return false; 00398 } 00399 00400 /* adjustable parameter */ 00401 int BUFFER = 80; 00402 00403 if(BUFFER>iGenes){ 00404 BUFFER = iGenes; 00405 } 00406 00407 00408 vector<size_t> sizes; 00409 int numTimes = iGenes / BUFFER; 00410 for(i=0; i<numTimes; i++){ 00411 sizes.push_back(BUFFER); 00412 } 00413 if(iGenes % BUFFER > 0){ 00414 numTimes++; 00415 sizes.push_back(iGenes % BUFFER); 00416 } 00417 00418 size_t bb = 0; 00419 size_t sofar = 0; 00420 fprintf(stderr, "Number of times: %d\n", numTimes); 00421 for(bb=0; bb<numTimes; sofar+=sizes[bb], bb++){ 00422 fprintf(stderr, "bb sizes[bb] sofar: %d %d %d\n", bb, sizes[bb], sofar); 00423 00424 fprintf(stderr, "Started allocated memory %lu\n", bb); 00425 /* load all Databaselets into memory, for efficiency */ 00426 unsigned char **charImages = 00427 (unsigned char**)malloc(vecDatabaselet.size()*sizeof(unsigned char*)); 00428 size_t totSize = 0; 00429 size_t numPairs = first->m_iGenes * sizes[bb]; 00430 00431 for(i=0; i<vecDatabaselet.size(); i++){ 00432 totSize += vecDatabaselet[i]->GetSizePair() * numPairs; 00433 } 00434 charImages[0] = (unsigned char*)malloc(totSize*sizeof(unsigned char)); 00435 for(i=1; i<vecDatabaselet.size(); i++){ 00436 charImages[i] = charImages[i-1] + vecDatabaselet[i-1]->GetSizePair() * numPairs; 00437 } 00438 00439 fprintf(stderr, "Finished allocated memory\n"); 00440 00441 fprintf(stderr, "Started reading into memory\n"); 00442 00443 /* read databaselet into charImages */ 00444 for(i=0; i<vecDatabaselet.size(); i++){ 00445 CDatabaselet *current = vecDatabaselet[i]; 00446 if(!current->m_fstm.is_open()){ 00447 cerr << "CDatabaselet is not opened. Opening..." << endl; 00448 current->m_fstm.open( current->strFileName.c_str( ), ios_base::binary | ios_base::in ); 00449 } 00450 current->m_fstm.seekg(current->m_iHeader + current->GetSizePair()*first->m_iGenes*sofar, ios_base::beg); 00451 current->m_fstm.read((char*) charImages[i], current->GetSizePair() * numPairs); 00452 } 00453 00454 fprintf(stderr, "Finished reading into memory\n"); 00455 00456 //Global gene name mapping 00457 map<string, size_t> mapstrintGenes; 00458 for(i=0; i<vecstrGenes.size(); i++){ 00459 mapstrintGenes[vecstrGenes[i]] = i; 00460 } 00461 00462 char acNumber[16]; 00463 00464 /* splitting to one gene per file after combine */ 00465 if(bSplit){ 00466 00467 for(i=0; i<sizes[bb]; i++){ 00468 /* open a new Databaselet containing only one gene */ 00469 string thisGene = first->GetGene(sofar + i); 00470 size_t iGeneID = mapstrintGenes[thisGene]; 00471 sprintf(acNumber, "%08lu", iGeneID); 00472 00473 string path = strOutDirectory + "/" + acNumber + ".db"; 00474 vector<string> vecstrThisGene; 00475 vecstrThisGene.push_back(thisGene); 00476 00477 //fprintf(stderr, "Setting up gene %d in %d\n", i, bb); 00478 00479 CDatabaselet DBS(first->m_useNibble); 00480 DBS.Open(path.c_str(), vecstrThisGene, first->m_iGenes, iDatasets); 00481 00482 /* Create a new Databaselet */ 00483 size_t iSize; 00484 unsigned char *abImage = (unsigned char*) 00485 malloc( iSize = (DBS.GetSizeGene( ) * DBS.m_vecstrGenes.size( ) )); 00486 size_t iDatum; 00487 size_t iGeneOne, iGeneTwo; 00488 size_t offset2, offset3; 00489 iGeneOne = i; 00490 00491 //fprintf(stderr, "Finished setting up abImage gene %d in %d\n", i, bb); 00492 00493 if(first->m_useNibble==false){ 00494 /* m_iGenes is all the genes in the genome */ 00495 for( iGeneTwo = 0; iGeneTwo < first->m_iGenes; ++iGeneTwo ){ 00496 offset2 = DBS.GetSizePair()*iGeneTwo; 00497 int totalSum = 0; 00498 for( iDatum = 0; iDatum < vecDatabaselet.size(); iDatum ++ ){ 00499 vector<unsigned char> vc; 00500 CDatabaselet *current = vecDatabaselet[iDatum]; 00501 size_t offset_c = current->GetSizeGene() * i + current->GetSizePair() * iGeneTwo; 00502 current->Get( offset_c, vc, charImages[iDatum]); 00503 offset3 = offset2 + totalSum; 00504 for(j=0; j<vc.size(); j++){ 00505 abImage[offset3 + j] = vc[j]; 00506 } 00507 totalSum+=vc.size(); 00508 } 00509 } 00510 }else{ 00511 size_t j; 00512 unsigned char *abImage2 = (unsigned char*)malloc(iDatasets); 00513 00514 /* m_iGenes is all the genes in the genome */ 00515 for( iGeneTwo = 0; iGeneTwo < first->m_iGenes; ++iGeneTwo ){ 00516 offset2 = DBS.GetSizePair() * iGeneTwo; 00517 int totalSum = 0; 00518 for( iDatum = 0; iDatum < vecDatabaselet.size(); iDatum ++ ){ 00519 vector<unsigned char> vc; 00520 CDatabaselet *current = vecDatabaselet[iDatum]; 00521 size_t offset_c = current->GetSizeGene() * i + current->GetSizePair() * iGeneTwo; 00522 current->Get( offset_c, vc, charImages[iDatum]); 00523 offset3 = totalSum; 00524 for(j=0; j<vc.size(); j++){ 00525 abImage2[offset3+j] = vc[j]; 00526 } 00527 totalSum+=vc.size(); 00528 } 00529 for(j=0; j+1 < iDatasets; j+=2){ 00530 abImage[offset2 + j / 2] = (abImage2[j] & 0xF) | (abImage2[j+1] << 4); 00531 } 00532 if(j<iDatasets){ 00533 unsigned char bValue = abImage2[iDatasets - 1]; 00534 unsigned char b = 255; 00535 abImage[offset2 + j / 2] = ( bValue & 0xF ) | ( b & 0xF0 ); 00536 } 00537 } 00538 00539 free(abImage2); 00540 } 00541 00542 //fprintf(stderr, "Writing abImage gene %d in %d\n", i, bb); 00543 00544 /* close fstream */ 00545 if(!DBS.m_fstm.is_open()){ 00546 cerr << "CDatabaselet is not opened. Opening..." << endl; 00547 DBS.m_fstm.open(DBS.strFileName.c_str( ), ios_base::binary | ios_base::in); 00548 } 00549 DBS.m_fstm.seekp( DBS.m_iHeader, ios_base::beg ); 00550 DBS.m_fstm.write( (char*)abImage, iSize ); 00551 DBS.m_fstm.close(); 00552 free(abImage); 00553 00554 fprintf(stderr, "Finished writing abImage gene %lu (of %lu) in %lu (of %lu)\n", i, 00555 sizes[bb], bb, sizes.size()); 00556 00557 } 00558 00559 /* do not split, just combine into one file */ 00560 }else{ 00561 00562 vector<string> strTok; 00563 CMeta::Tokenize(first->strFileName.c_str(), strTok, "/"); 00564 string path = strOutDirectory + "/" + strTok[strTok.size()-1]; 00565 00566 CDatabaselet DBS(first->m_useNibble); 00567 if(bb==0){ 00568 DBS.Open(path.c_str(), first->m_vecstrGenes, first->m_iGenes, iDatasets); 00569 }else{ 00570 DBS.OpenNoOverwrite(); 00571 } 00572 00573 size_t iDatum; 00574 size_t iSize; 00575 unsigned char *abImage = (unsigned char*) 00576 malloc( iSize = (DBS.GetSizeGene( ) * sizes[bb] ) ); 00577 size_t iGeneOne, iGeneTwo; 00578 size_t offset1, offset2, offset3; 00579 00580 fprintf(stderr, "GetSizeGene() %d\n", DBS.GetSizeGene()); 00581 fprintf(stderr, "GetSizePair() %d\n", DBS.GetSizePair()); 00582 fprintf(stderr, "first->m_iGenes %d\n", first->m_iGenes); 00583 if(first->m_useNibble==false){ 00584 for(iGeneOne = 0; iGeneOne < sizes[bb]; ++iGeneOne){ 00585 offset1 = DBS.GetSizeGene() * iGeneOne; 00586 for( iGeneTwo = 0; iGeneTwo < first->m_iGenes; ++iGeneTwo ){ 00587 offset2 = DBS.GetSizePair()*iGeneTwo; 00588 int totalSum = 0; 00589 for( iDatum = 0; iDatum < vecDatabaselet.size(); iDatum ++ ){ 00590 vector<unsigned char> vc; 00591 CDatabaselet *current = vecDatabaselet[iDatum]; 00592 //size_t offset_c = current->GetSizeGene() * iGeneOne + current->GetSizePair() * iGeneTwo; 00593 //current->Get( offset1 + offset2, vc, charImages[iDatum]); 00594 //current->Get( iGeneOne, iGeneTwo, vc, charImages[iDatum]); 00595 current->Get(current->GetSizeGene() * iGeneOne + current->GetSizePair() * iGeneTwo, vc, charImages[iDatum]); 00596 offset3 = offset1 + offset2 + totalSum; 00597 for(j=0; j<vc.size(); j++){ 00598 abImage[offset3 + j] = vc[j]; 00599 } 00600 totalSum+=vc.size(); 00601 } 00602 } 00603 } 00604 }else{ 00605 size_t j; 00606 unsigned char *abImage2 = (unsigned char*) 00607 malloc(DBS.m_iDatasets); 00608 /* m_iGenes is all the genes in the genome */ 00609 for(iGeneOne = 0; iGeneOne < sizes[bb]; ++iGeneOne){ 00610 offset1 = DBS.GetSizeGene() * iGeneOne; 00611 for( iGeneTwo = 0; iGeneTwo < first->m_iGenes; ++iGeneTwo ){ 00612 offset2 = DBS.GetSizePair()*iGeneTwo; 00613 int totalSum = 0; 00614 for( iDatum = 0; iDatum < vecDatabaselet.size(); iDatum ++ ){ 00615 vector<unsigned char> vc; 00616 CDatabaselet *current = vecDatabaselet[iDatum]; 00617 current->Get(current->GetSizeGene()*iGeneOne + current->GetSizePair()*iGeneTwo, vc, charImages[iDatum]); 00618 //current->Get( offset1 + offset2, vc, charImages[iDatum]); 00619 //current->Get( iGeneOne, iGeneTwo, vc, charImages[iDatum]); 00620 offset3 = totalSum; 00621 for(j=0; j<vc.size(); j++){ 00622 abImage2[offset3 + j] = vc[j]; 00623 } 00624 totalSum+=vc.size(); 00625 } 00626 for(j=0; j+1 < iDatasets; j+=2){ 00627 abImage[offset1 + offset2 + j / 2] = (abImage2[j] & 0xF) | (abImage2[j+1] << 4); 00628 } 00629 if(j<iDatasets){ 00630 unsigned char bValue = abImage2[iDatasets - 1]; 00631 unsigned char b = 255; 00632 abImage[offset1 + offset2 + j / 2] = ( bValue & 0xF ) | ( b & 0xF0 ); 00633 } 00634 } 00635 } 00636 free(abImage2); 00637 } 00638 00639 /* close the databaselet */ 00640 if(!DBS.m_fstm.is_open()){ 00641 cerr << "CDatabaselet is not opened." << endl; 00642 DBS.m_fstm.open(DBS.strFileName.c_str( ), ios_base::binary | ios_base::in); 00643 } 00644 //DBS.m_fstm.seekp( DBS.m_iHeader, ios_base::beg ); 00645 DBS.m_fstm.seekg(DBS.m_iHeader + DBS.GetSizePair()*first->m_iGenes*sofar, ios_base::beg); 00646 DBS.m_fstm.write( (char*)abImage, iSize ); 00647 DBS.m_fstm.close(); 00648 free(abImage); 00649 } 00650 00651 free(charImages[0]); 00652 free(charImages); 00653 00654 } 00655 00656 return true; 00657 } 00658 00659 bool CDatabaselet::Get( size_t iOne, size_t iTwo, vector<unsigned char>& vecbData ) const { 00660 size_t i; 00661 00662 i = vecbData.size( ); 00663 vecbData.resize( i + GetSizePair( ) ); 00664 pthread_mutex_lock( m_pmutx ); 00665 00666 m_fstm.seekg( GetOffset( iOne, iTwo ), ios_base::beg); 00667 m_fstm.read( (char*)&vecbData[ i ], GetSizePair( ) ); 00668 pthread_mutex_unlock( m_pmutx ); 00669 00670 return true; } 00671 00672 bool CDatabaselet::Get( size_t iGene, vector<unsigned char>& vecbData, bool fReplace ) const { 00673 size_t i; 00674 00675 i = fReplace ? 0 : vecbData.size( ); 00676 vecbData.resize( i + GetSizeGene( ) ); 00677 pthread_mutex_lock( m_pmutx ); 00678 00679 m_fstm.seekg( GetOffset( iGene ), ios_base::beg); 00680 m_fstm.read( (char*)&vecbData[ i ], GetSizeGene( ) ); 00681 pthread_mutex_unlock( m_pmutx ); 00682 00683 return true; } 00684 00685 bool CDatabaselet::Get( size_t iGene, const vector<size_t>& veciGenes, vector<unsigned char>& vecbData, 00686 bool fReplace ) const { 00687 size_t i, iOffset; 00688 00689 iOffset = fReplace ? 0 : vecbData.size( ); 00690 vecbData.resize( iOffset + ( veciGenes.size( ) * GetSizePair( ) ) ); 00691 pthread_mutex_lock( m_pmutx ); 00692 for( i = 0; i < veciGenes.size( ); ++i,iOffset += GetSizePair( ) ) { 00693 m_fstm.seekg( GetOffset( iGene, veciGenes[ i ] ), ios_base::beg); 00694 m_fstm.read( (char*)&vecbData[ iOffset ], GetSizePair( ) ); 00695 } 00696 pthread_mutex_unlock( m_pmutx ); 00697 00698 return true; } 00699 00700 bool CDatabaselet::Open( const std::string& strFile ) { 00701 uint32_t iSize; 00702 char* acBuffer; 00703 char* pc; 00704 size_t i; 00705 00706 m_fstm.clear( ); 00707 m_fstm.open( strFile.c_str( ), ios_base::binary | ios_base::in ); 00708 if( !m_fstm.is_open( ) ) { 00709 g_CatSleipnir( ).error( "CDatabaselet::Open( %s ) open failed", strFile.c_str( ) ); 00710 return false; 00711 } 00712 00713 m_fstm.read( (char*)&m_iHeader, sizeof(m_iHeader) ); 00714 m_fstm.read( (char*)&m_iGenes, sizeof(m_iGenes) ); 00715 m_fstm.read( (char*)&m_iDatasets, sizeof(m_iDatasets) ); 00716 m_fstm.read( (char*)&iSize, sizeof(iSize) ); 00717 00718 acBuffer = new char[ m_iHeader ]; 00719 m_fstm.read( acBuffer, m_iHeader - sizeof(m_iHeader) - sizeof(m_iGenes) - sizeof(m_iDatasets) - sizeof(iSize) ); 00720 m_vecstrGenes.resize( iSize ); 00721 for( i = 0,pc = acBuffer; i < m_vecstrGenes.size( ); pc += m_vecstrGenes[ i++ ].length( ) + 1 ) 00722 m_vecstrGenes[ i ] = pc; 00723 delete[] acBuffer; 00724 00725 SetFile(strFile); 00726 00727 return true; } 00728 00730 // CDatabase 00732 00733 bool CDatabase::Open( const std::vector<std::string>& vecstrGenes, const std::string& strInputDirectory, 00734 const IBayesNet* pBayesNet, const std::string& strOutputDirectory, size_t iFiles, const map<string, size_t>& mapstriZeros) { 00735 vector<string> vecstrNodes, vecstrSubset; 00736 size_t i, j; 00737 char acNumber[ 16 ]; 00738 string strFile; 00739 00740 if( !pBayesNet ) { 00741 g_CatSleipnir( ).error( "CDatabase::Open( %s, %d ) null Bayes net", strOutputDirectory.c_str( ), iFiles ); 00742 return false; } 00743 00744 Clear( ); 00745 pBayesNet->GetNodes( vecstrNodes ); 00746 for( i = 1; i < vecstrNodes.size( ); ++i ) 00747 vecstrNodes[ i - 1 ] = strInputDirectory + '/' + vecstrNodes[ i ]; 00748 00749 if( vecstrNodes.size( ) ) 00750 vecstrNodes.resize( vecstrNodes.size( ) - 1 ); 00751 m_vecpDBs.resize( iFiles ); 00752 00753 int iNumFilesOpen = 1000; 00754 for( i = 0; i < m_vecpDBs.size( ); ++i ) { 00755 m_vecpDBs[ i ] = new CDatabaselet( m_useNibble ); 00756 } 00757 00758 size_t k; 00759 for( i = 0; i < m_vecpDBs.size( ); ++i ) { 00760 if(i%iNumFilesOpen==0 && i>0){ 00761 for(k=0; k<iNumFilesOpen; k++){ 00762 m_vecpDBs[i-k-1]->CloseFile(); 00763 } 00764 } 00765 vecstrSubset.clear( ); 00766 for( j = i; j < vecstrGenes.size( ); j += m_vecpDBs.size( ) ) 00767 vecstrSubset.push_back( vecstrGenes[ j ] ); 00768 #pragma warning(disable : 4996) 00769 sprintf( acNumber, "%08lu", i ); 00770 #pragma warning(default : 4996) 00771 strFile = strOutputDirectory + '/' + acNumber + c_acExtension; 00772 if( !( i % 100 ) ) 00773 g_CatSleipnir( ).notice( "CDatabase::Open( %s, %d ) initializing file %d/%d", 00774 strOutputDirectory.c_str( ), iFiles, i, m_vecpDBs.size( ) ); 00775 if( !m_vecpDBs[ i ]->Open( strFile, vecstrSubset, vecstrGenes.size( ), vecstrNodes.size( ) ) ) { 00776 g_CatSleipnir( ).error( "CDatabase::Open( %s, %d ) could not open file %s", 00777 strOutputDirectory.c_str( ), iFiles, strFile.c_str( ) ); 00778 return false; } } 00779 00780 for( i = 0; i < m_vecpDBs.size( ); ++i ){ 00781 m_vecpDBs[i]->CloseFile(); 00782 } 00783 00784 for( i = 0; i < vecstrGenes.size( ); ++i ) 00785 m_mapstriGenes[ m_vecpDBs[ i % m_vecpDBs.size( ) ]->GetGene( i / m_vecpDBs.size( ) ) ] = i; 00786 00787 return CDatabaseImpl::Open( vecstrGenes, vecstrNodes, mapstriZeros ); 00788 } 00789 00790 /* Version of Open() that takes a list of datasets as input. Key method */ 00791 bool CDatabase::Open( const std::vector<std::string>& vecstrGenes, const std::vector<std::string>& vecstrDatasets, 00792 const std::string& strInputDirectory, const std::string& strOutputDirectory, size_t iFiles, const map<string, size_t>& mapstriZeros){ 00793 00794 vector<string> vecstrNodes, vecstrSubset; 00795 size_t i, j; 00796 char acNumber[ 16 ]; 00797 string strFile; 00798 00799 Clear(); 00800 00801 vecstrNodes.resize(vecstrDatasets.size()); 00802 for(i=0; i<vecstrDatasets.size(); i++){ 00803 vecstrNodes[i] = strInputDirectory + '/' + vecstrDatasets[i]; 00804 } 00805 00806 m_vecpDBs.resize( iFiles ); 00807 int iNumFilesOpen = 1000; 00808 00809 for( i = 0; i < m_vecpDBs.size( ); ++i ) { 00810 m_vecpDBs[ i ] = new CDatabaselet( m_useNibble ); 00811 } 00812 00813 size_t k; 00814 00815 for( i = 0; i < m_vecpDBs.size( ); ++i ) { //block size (such as 1000) 00816 if(i%iNumFilesOpen==0 && i>0){ 00817 for(k=0; k<iNumFilesOpen; k++){ 00818 m_vecpDBs[i-k-1]->CloseFile(); 00819 } 00820 } 00821 vecstrSubset.clear( ); 00822 for( j = i; j < vecstrGenes.size( ); j += m_vecpDBs.size( ) ) 00823 vecstrSubset.push_back( vecstrGenes[ j ] ); //contains index for 1000, 2000, 3000th genes 00824 sprintf( acNumber, "%08lu", i ); 00825 strFile = strOutputDirectory + '/' + acNumber + c_acExtension; 00826 00827 if( !( i % 100 ) ) 00828 g_CatSleipnir( ).notice( "CDatabase::Open( %s, %d ) initializing file %d/%d", 00829 strOutputDirectory.c_str( ), iFiles, i, m_vecpDBs.size( ) ); 00830 if( !m_vecpDBs[ i ]->Open( strFile, vecstrSubset, vecstrGenes.size( ), vecstrNodes.size( ) ) ) { 00831 g_CatSleipnir( ).error( "CDatabase::Open( %s, %d ) could not open file %s", 00832 strOutputDirectory.c_str( ), iFiles, strFile.c_str( ) ); 00833 return false; } } 00834 00835 for( i = 0; i < m_vecpDBs.size( ); ++i ){ 00836 m_vecpDBs[i]->CloseFile(); 00837 } 00838 00839 for( i = 0; i < vecstrGenes.size( ); ++i ) 00840 m_mapstriGenes[ m_vecpDBs[ i % m_vecpDBs.size( ) ]->GetGene( i / m_vecpDBs.size( ) ) ] = i; 00841 00842 return CDatabaseImpl::Open( vecstrGenes, vecstrNodes, mapstriZeros ); 00843 } 00844 00845 /* the key Open() method for Data2DB conversion */ 00846 bool CDatabaseImpl::Open( const std::vector<std::string>& vecstrGenes, 00847 const std::vector<std::string>& vecstrFiles, const map<string, size_t>& mapstriZeros ) { 00848 size_t i, j, k, iOne, iTwo, iOutBlock, iOutBase, iOutOffset, iInBlock, iInBase, iInOffset; 00849 vector<size_t> veciGenes; 00850 float d; 00851 map<string, size_t>::const_iterator iterZero; 00852 00853 veciGenes.resize( vecstrGenes.size( ) ); 00854 iOutBlock = ( m_iBlockOut == -1 ) ? m_vecpDBs.size( ) : m_iBlockOut; 00855 iInBlock = ( m_iBlockIn == -1 ) ? vecstrFiles.size( ) : m_iBlockIn; 00856 00857 int iNumFilesOpen = 1000; 00858 00859 /* blocking parameter, iOutBase: number of databaselets to process at a time */ 00860 for( iOutBase = 0; iOutBase < m_vecpDBs.size( ); iOutBase += iOutBlock ) { 00861 vector<string> vecstrMyGenes; 00862 vector<size_t> veciMyGenes; 00863 00864 for( iOutOffset = 0; ( iOutOffset < iOutBlock ) && ( ( iOutBase + iOutOffset ) < m_vecpDBs.size( ) ); 00865 ++iOutOffset ) { 00866 const CDatabaselet& DB = *m_vecpDBs[ iOutBase + iOutOffset ]; 00867 00868 for( i = 0; i < DB.GetGenes( ); ++i ) 00869 vecstrMyGenes.push_back( DB.GetGene( i ) ); 00870 } 00871 00872 veciMyGenes.resize( vecstrMyGenes.size( ) ); 00873 00874 for( iInBase = 0; iInBase < vecstrFiles.size( ); iInBase += iInBlock ) { 00875 vector<CCompactFullMatrix> vecData; 00876 vecData.resize( ( ( iInBase + iInBlock ) > vecstrFiles.size( ) ) ? 00877 ( vecstrFiles.size( ) - iInBase ) : iInBlock ); 00878 for( iInOffset = 0; iInOffset < vecData.size( ); ++iInOffset ) { 00879 CDataPair Dat; 00880 00881 if( !Dat.Open( (vecstrFiles[ iInBase + iInOffset ] + c_acDAB).c_str( ), 00882 false, m_fMemmap) ) { 00883 if( !Dat.Open( (vecstrFiles[ iInBase + iInOffset ] + c_acQDAB).c_str( ), false, m_fMemmap ) ) { 00884 g_CatSleipnir( ).error( "CDatabaseImpl::Open( ) could not open %s", 00885 (vecstrFiles[ iInBase + iInOffset ] + c_acDAB).c_str( ) ); 00886 g_CatSleipnir( ).error( "CDatabaseImpl::Open( ) could not open %s", 00887 (vecstrFiles[ iInBase + iInOffset ] + c_acQDAB).c_str( ) ); 00888 return false; 00889 } 00890 } 00891 00892 for( i = 0; i < veciMyGenes.size( ); ++i ) 00893 veciMyGenes[ i ] = Dat.GetGene( vecstrMyGenes[ i ] ); 00894 for( i = 0; i < veciGenes.size( ); ++i ) 00895 veciGenes[ i ] = Dat.GetGene( vecstrGenes[ i ] ); 00896 00897 if(m_useNibble){ 00898 vecData[ iInOffset ].Initialize( veciMyGenes.size( ), veciGenes.size( ), 16, true ); 00899 }else{ 00900 vecData[ iInOffset ].Initialize( veciMyGenes.size( ), veciGenes.size( ), (unsigned char) 256, true ); 00901 } 00902 00903 string strName = CMeta::Filename( CMeta::Deextension( CMeta::Basename( vecstrFiles[ iInBase + iInOffset ].c_str() ) ) ); 00904 size_t iZero = ( ( iterZero = mapstriZeros.find( strName ) ) == mapstriZeros.end( ) ) ? -1 : iterZero->second; 00905 //#pragma omp parallel for \ 00906 shared(Dat, veciGenes, veciMyGenes, vecData) \ 00907 private(j, i) \ 00908 schedule(static) 00909 00910 for(i=0; i<veciMyGenes.size(); i++){ 00911 size_t iOne = veciMyGenes[i]; 00912 for(j=0; j<veciGenes.size(); j++){ 00913 size_t iTwo = veciGenes[j]; 00914 size_t iVal = -1; 00915 if ( iOne != -1 && iTwo != -1 ) { 00916 if( !CMeta::IsNaN( d = Dat.Get( iOne, iTwo ) ) ) 00917 iVal = Dat.Quantize(d); 00918 if ( iVal == -1 ) 00919 iVal = iZero; 00920 } 00921 if ( iVal != -1 ) 00922 vecData[iInOffset].Set(i,j,iVal+1); 00923 } 00924 } 00925 00926 00927 } 00928 00929 for( i = iOutOffset = 0; ( iOutOffset < iOutBlock ) && ( ( iOutBase + iOutOffset ) < 00930 m_vecpDBs.size( ) ); ++iOutOffset ) { 00931 CDatabaselet& DB = *m_vecpDBs[ iOutBase + iOutOffset ]; 00932 00933 /* close files if too many file handles opened */ 00934 /*if(iOutOffset>0 && (iOutOffset % iNumFilesOpen==0 || 00935 (iOutOffset + 1) >= iOutBlock || (iOutBase + iOutOffset + 1) >= m_vecpDBs.size() ) ){ 00936 for(k=0; k<iNumFilesOpen; k++){ 00937 if(iOutOffset + iOutBase - k - 1 < 0){ 00938 break; 00939 } 00940 m_vecpDBs[iOutOffset + iOutBase - 1 - k]->CloseFile(); 00941 } 00942 }*/ 00943 00944 DB.OpenNoOverwrite(); 00945 00946 if( !( iOutOffset % 100 ) ) 00947 cerr << "Processing offset " << iOutOffset << '/' << iOutBlock << endl; 00948 //iInBase, for B=2, iInBase = 0, 2, 4 (total 5 datasets) 00949 //i = 0 18 36 ... 00950 if( !DB.Open( vecData, i, iInBase, m_fBuffer ) ){ 00951 return false; 00952 } 00953 00954 i += DB.GetGenes( ); 00955 00956 DB.CloseFile(); 00957 } 00958 } 00959 } 00960 00961 return true; 00962 } 00963 00964 bool CDatabase::Open( const std::string& strInputDirectory ) { 00965 size_t i, j; 00966 vector<string> vecstrFiles; 00967 string strFile; 00968 00969 FOR_EACH_DIRECTORY_FILE(strInputDirectory, strFile) 00970 if( !CMeta::IsExtension( strFile, c_acExtension ) ) 00971 continue; 00972 00973 i = atoi( CMeta::Deextension( CMeta::Basename( strFile.c_str( ) ) ).c_str( ) ); 00974 if( vecstrFiles.size( ) <= i ) 00975 vecstrFiles.resize( i + 1 ); 00976 else if( vecstrFiles[ i ].length( ) != 0 ) { 00977 g_CatSleipnir( ).error( "CDatabase::Open( %s ) duplicate file: %s (%d)", strInputDirectory.c_str( ), 00978 strFile.c_str( ), i ); 00979 return false; } 00980 vecstrFiles[ i ] = strInputDirectory + '/' + strFile; } 00981 00982 Clear( ); 00983 m_vecpDBs.resize( vecstrFiles.size( ) ); 00984 for( i = 0; i < m_vecpDBs.size( ); ++i ) { 00985 m_vecpDBs[ i ] = new CDatabaselet( m_useNibble); 00986 if( !m_vecpDBs[ i ]->Open( vecstrFiles[ i ] ) ) 00987 return false; 00988 for( j = 0; j < m_vecpDBs[ i ]->GetGenes( ); ++j ) 00989 m_mapstriGenes[ m_vecpDBs[ i ]->GetGene( j ) ] = ( j * m_vecpDBs.size( ) ) + i; } 00990 00991 return true; 00992 } 00993 00994 bool CDatabaselet::Set(uint32_t &iGenes, uint32_t &iDatasets, vector<string> &vecstrSubsetGenes){ 00995 size_t i; 00996 m_vecstrGenes.clear(); 00997 m_vecstrGenes.resize(vecstrSubsetGenes.size()); 00998 for(i=0; i<vecstrSubsetGenes.size(); i++){ 00999 m_vecstrGenes[i] = vecstrSubsetGenes[i]; 01000 } 01001 01002 m_iGenes = iGenes; 01003 m_iDatasets = iDatasets; 01004 uint32_t iSize = m_vecstrGenes.size( ); 01005 m_iHeader = sizeof(m_iHeader) + sizeof(m_iGenes) + sizeof(m_iDatasets) + sizeof(iSize); 01006 for( i = 0; i < m_vecstrGenes.size( ); ++i ) { 01007 m_iHeader += m_vecstrGenes[ i ].size( ) + 1; 01008 } 01009 01010 return true; 01011 } 01012 01013 //Create a copy of current CDatabase collection that has X number of CDatabaselets 01014 bool CDatabase::Reorganize(const char *dest_db_dir, const size_t &num_db){ 01015 int dest_db = num_db; 01016 size_t i, j, l; 01017 const char c_acExtension[] = ".db"; 01018 char acNumber[16]; 01019 01020 vector<string> vecstrG; 01021 vecstrG.resize(m_mapstriGenes.size()); 01022 01023 for(map<string,size_t>::iterator iter=m_mapstriGenes.begin(); 01024 iter!=m_mapstriGenes.end(); iter++){ 01025 string first = iter->first; 01026 size_t second = iter->second; 01027 vecstrG[second] = first; 01028 } 01029 01030 for(i=0; i<dest_db; i++){ 01031 vector<string> vecstrSubset; 01032 for(j=i; j<vecstrG.size(); j+=dest_db) 01033 vecstrSubset.push_back(vecstrG[j]); 01034 //size of this db 01035 size_t iSize = GetGenes() * GetDatasets() * vecstrSubset.size(); 01036 01037 unsigned char *Qt = (unsigned char*)malloc(iSize); 01038 int tot = 0; 01039 for(j=0; j<vecstrSubset.size(); j++){ 01040 size_t k = m_mapstriGenes.find(vecstrSubset[j])->second; 01041 vector<unsigned char> Qi; 01042 if(!GetGene(k, Qi)){ 01043 cerr << "Gene error" << endl; 01044 continue; 01045 } 01046 for(l=0; l<Qi.size(); l++) 01047 Qt[tot+l] = Qi[l]; 01048 tot+=Qi.size(); 01049 } 01050 sprintf(acNumber, "%08lu", i); 01051 string dest_dir = dest_db_dir; 01052 string strFile = dest_dir + "/" + acNumber + c_acExtension; 01053 01054 CDatabaselet DBS(false); 01055 DBS.Open(strFile.c_str(), vecstrSubset, vecstrG.size(), 01056 GetDatasets()); 01057 DBS.Write((char*) Qt, iSize, 0); 01058 free(Qt); 01059 } 01060 return true; 01061 } 01062 01063 //For SeekMiner 01064 bool CDatabase::Open(const string &strDBDirectory, 01065 const vector<string> &vecstrGenes, const size_t &iDatasets, const size_t &iNumDBs){ 01066 return CDatabase::Open(strDBDirectory.c_str(), vecstrGenes, iDatasets, iNumDBs); 01067 } 01068 01069 //For SeekMiner 01070 bool CDatabase::Open(const char *db_dir, const vector<string> &vecstrGenes, 01071 const size_t &iDatasets, const size_t &iNumDBs){ 01072 size_t i, j, k; 01073 Clear(); 01074 m_vecpDBs.resize(iNumDBs); 01075 char acNumber[ 16 ]; 01076 01077 for( i = 0; i < m_vecpDBs.size( ); ++i ) 01078 m_vecpDBs[ i ] = new CDatabaselet( m_useNibble ); 01079 01080 string strDBDirectory = db_dir; 01081 01082 for(i=0; i<iNumDBs; i++){ 01083 vector<string> vecstrSubset; 01084 vecstrSubset.clear( ); 01085 for( j = i; j < vecstrGenes.size( ); j += m_vecpDBs.size( ) ) 01086 vecstrSubset.push_back( vecstrGenes[ j ] ); //contains index for 1000, 2000, 3000th genes 01087 sprintf( acNumber, "%08lu", i ); 01088 string strFile = strDBDirectory + '/' + acNumber + c_acExtension; 01089 uint32_t iGenes = vecstrGenes.size(); 01090 uint32_t iDset = iDatasets; 01091 m_vecpDBs[i]->Set(iGenes, iDset, vecstrSubset); 01092 m_vecpDBs[i]->SetFile(strFile); 01093 } 01094 01095 for( i = 0; i < vecstrGenes.size( ); ++i ){ 01096 string s = m_vecpDBs[ i % m_vecpDBs.size( ) ]->GetGene( i / m_vecpDBs.size( ) ); 01097 string file = m_vecpDBs[i % m_vecpDBs.size()]->GetFile(); 01098 //fprintf(stderr, "%s\t%s\t%d\n", s.c_str(), file.c_str(), i); 01099 m_mapstriGenes[ m_vecpDBs[ i % m_vecpDBs.size( ) ]->GetGene( i / m_vecpDBs.size( ) ) ] = i; 01100 } 01101 //fprintf(stderr, "Done\n"); getchar(); 01102 01103 return true; 01104 } 01105 01106 }