Sleipnir
src/database.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * Changes made Jun 2012 by Qian Zhu
00017 *
00018 * If you use this library, the included executable tools, or any related
00019 * code in your work, please cite the following publication:
00020 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00021 * Olga G. Troyanskaya.
00022 * "The Sleipnir library for computational functional genomics"
00023 *****************************************************************************/
00024 #include "stdafx.h"
00025 #include "database.h"
00026 #include "meta.h"
00027 #include "bayesnetint.h"
00028 #include "datapair.h"
00029 
00030 namespace Sleipnir {
00031 
00032 const char  CDatabaseImpl::c_acDAB[]        = ".dab";
00033 const char  CDatabaseImpl::c_acQDAB[]       = ".qdab";
00034 const char  CDatabaseImpl::c_acExtension[]  = ".db";
00035 
00037 // CDatabaselet
00039 
00040 CDatabaselet::CDatabaselet( bool useNibble) {
00041     m_useNibble = useNibble;
00042     m_pmutx = new pthread_mutex_t( );
00043     pthread_mutex_init( m_pmutx, NULL );
00044 }
00045 
00046 CDatabaselet::~CDatabaselet( ) {
00047 
00048     pthread_mutex_destroy( m_pmutx );
00049     delete m_pmutx;
00050     if(m_fstm.is_open()){
00051         m_fstm.close();
00052     }
00053 }
00054 
00055 /* original method for initializing databaselets, including writing header + pre-allocation */
00056 bool CDatabaselet::Open( const std::string& strFile, const std::vector<std::string>& vecstrGenes,
00057     uint32_t iGenes, uint32_t iDatasets ) {
00058     uint32_t    iSize;
00059     size_t      i;
00060     char*       acFiller;
00061 
00062     m_fstm.clear( );
00063     /* Open with overwriting */
00064     m_fstm.open( strFile.c_str( ), ios_base::in | ios_base::out | ios_base::binary | ios_base::trunc );
00065 
00066     if( !m_fstm.is_open( ) ) {
00067         g_CatSleipnir( ).error( "CDatabaselet::Open( %s, %u, %u ) open failed", strFile.c_str( ), iGenes,
00068             iDatasets );
00069         return false; }
00070 
00071 
00072     m_iGenes = iGenes;
00073     m_iDatasets = iDatasets;
00074     m_vecstrGenes.resize( vecstrGenes.size( ) );
00075     copy( vecstrGenes.begin( ), vecstrGenes.end( ), m_vecstrGenes.begin( ) );
00076 
00077     //allocate space
00078     m_fstm.write( (char*)&m_iHeader, sizeof(m_iHeader) );
00079     m_fstm.write( (char*)&m_iGenes, sizeof(m_iGenes) );
00080     m_fstm.write( (char*)&m_iDatasets, sizeof(m_iDatasets) );
00081 
00082     iSize = m_vecstrGenes.size( );
00083 
00084     m_fstm.write((char*)&iSize, sizeof(iSize));
00085 
00086     //write gene-name for only the genes in the databaselets
00087     m_iHeader = sizeof(m_iHeader) + sizeof(m_iGenes) + sizeof(m_iDatasets) + sizeof(iSize);
00088     for( i = 0; i < m_vecstrGenes.size( ); ++i ) {
00089         m_fstm.write( m_vecstrGenes[ i ].c_str( ), m_vecstrGenes[ i ].size( ) + 1);
00090         m_iHeader += m_vecstrGenes[ i ].size( ) + 1;
00091     }
00092 
00093     m_fstm.seekp( 0, ios_base::beg);
00094     m_fstm.write( (char*)&m_iHeader, sizeof(m_iHeader) );
00095 
00096     //pre-allocations
00097     m_fstm.seekp( m_iHeader, ios_base::beg );
00098     acFiller = new char[ GetSizeGene( ) ];
00099     memset( acFiller, -1, GetSizeGene( ) );
00100     for( i = 0; i < m_vecstrGenes.size( ); ++i ){
00101         m_fstm.write( acFiller, GetSizeGene( ) );
00102     }
00103     delete[] acFiller;
00104     SetFile(strFile);
00105 
00106     return true;
00107 
00108 }
00109 
00110 bool CDatabaselet::Write(char* Qt, const size_t &iSize, const size_t offset){
00111     m_fstm.seekg(m_iHeader + offset, ios_base::beg);
00112     m_fstm.write(Qt, iSize);
00113 }
00114 
00115 /* simply opens the file without overwriting */
00116 bool CDatabaselet::OpenNoOverwrite() {
00117     m_fstm.clear( );
00118     /* open without overwriting */
00119     m_fstm.open( strFileName.c_str( ), ios_base::in | ios_base::out | ios_base::binary);
00120 
00121     if( !m_fstm.is_open( ) ) {
00122         g_CatSleipnir( ).error( "CDatabaselet::Open( %s ) open failed", strFileName.c_str( ));
00123         return false;
00124     }
00125     return true;
00126 }
00127 
00128 bool CDatabaselet::OpenWrite( unsigned char bValue, size_t iOffset, ENibbles eNibbles,
00129     unsigned char* abImage ) {
00130     unsigned char   b;
00131 
00132     if(m_useNibble==0){
00133         eNibbles = ENibblesBoth;
00134     }
00135 
00136     if( abImage )
00137         iOffset -= m_iHeader;
00138     if( eNibbles != ENibblesBoth ) {
00139         if( abImage )
00140             b = abImage[ iOffset ];
00141         else {
00142             m_fstm.seekg( iOffset, ios_base::beg);
00143             b = m_fstm.get( );
00144         }
00145     }
00146 
00147     switch( eNibbles ) {
00148         case ENibblesLow:
00149             b = ( bValue & 0xF ) | ( b & 0xF0 );
00150             break;
00151 
00152         case ENibblesHigh:
00153             b = ( b & 0xF ) | ( bValue << 4 );
00154             break;
00155 
00156         case ENibblesBoth:
00157             b = bValue;
00158             break;
00159     }
00160     if( abImage )
00161         abImage[ iOffset ] = b;
00162     else {
00163         m_fstm.seekp( iOffset, ios_base::beg );
00164         m_fstm.put( b );
00165     }
00166 
00167     return true; }
00168 
00169 /* original file writing method */
00170 bool CDatabaselet::Open( const vector<CCompactFullMatrix>& vecData, size_t iBaseGenes, size_t iBaseDatasets, bool fBuffer ) {
00171     unsigned char*  abImage;
00172     size_t          iSize, iDatum, iGeneOne, iGeneTwo;
00173     unsigned char   bOne, bTwo;
00174 
00175     if( fBuffer ) {
00176         //iBaseGenes: gene id of first gene in each databaselet
00177         //iDataset: dataset id
00178         //printf("Number: %d %d %d %d\n", GetSizeGene(), GetSizePair(), iBaseGenes, iBaseDatasets);
00179         abImage = new unsigned char[ iSize = ( GetSizeGene( ) * m_vecstrGenes.size( ) ) ];
00180         m_fstm.seekg( m_iHeader, ios_base::beg );
00181         m_fstm.read( (char*)abImage, iSize );
00182     }
00183     else
00184         abImage = NULL;
00185 
00186     //vecData: # of genes in databaselet x # of genes user's list
00187     //if this is not the first dataset in the dataset block
00188     if( iBaseDatasets % 2 ){
00189         //iGeneOne: iterate over all genes in this databaselet (# of genes in each databaselet)
00190         for( iGeneOne = 0; iGeneOne < GetGenes( ); ++iGeneOne ){
00191             //iGeneTwo: iterate overall genes in user's gene list
00192             for( iGeneTwo = 0; iGeneTwo < vecData[ 0 ].GetColumns( ); ++iGeneTwo ){
00193                 //bOne, get the value of the gene located at the position (iBaseGene + iGeneOne, iGeneTwo)
00194                 if( bOne = vecData[ 0 ].Get( iBaseGenes + iGeneOne, iGeneTwo ) ){
00195                     //Offset is: m_iHeader + (GetSizeGene() * iOne) + (GetSizePair() * iTwo) + iDataset (for byte case)
00196                     OpenWrite( bOne - 1, GetOffset( iGeneOne, iGeneTwo, iBaseDatasets ), ENibblesHigh, abImage );
00197                 }
00198             }
00199         }
00200     }
00201 
00202     for( iDatum = ( iBaseDatasets % 2 ); ( iDatum + 1 ) < vecData.size( ); iDatum += 2 ){
00203         for( iGeneOne = 0; iGeneOne < GetGenes( ); ++iGeneOne ){
00204             for( iGeneTwo = 0; iGeneTwo < vecData[ iDatum ].GetColumns( ); ++iGeneTwo ) {
00205                 bOne = vecData[ iDatum ].Get( iBaseGenes + iGeneOne, iGeneTwo );
00206                 bTwo = vecData[ iDatum + 1 ].Get( iBaseGenes + iGeneOne, iGeneTwo );
00207                 if( !( bOne || bTwo ) )
00208                     continue;
00209                 bOne -= 1;
00210                 bTwo -= 1;
00211                 if(m_useNibble){
00212                     OpenWrite( ( bOne & 0xF ) | ( bTwo << 4 ), GetOffset( iGeneOne, iGeneTwo, iBaseDatasets +
00213                             iDatum ), ENibblesBoth, abImage );
00214                 }else{
00215                     OpenWrite( bOne, GetOffset( iGeneOne, iGeneTwo, iBaseDatasets + iDatum ), ENibblesBoth,
00216                             abImage );
00217                     OpenWrite( bTwo, GetOffset( iGeneOne, iGeneTwo, iBaseDatasets + iDatum + 1 ), ENibblesBoth,
00218                             abImage );
00219                 }
00220             }
00221         }
00222     }
00223 
00224     if( iDatum < vecData.size( ) ){
00225         for( iGeneOne = 0; iGeneOne < GetGenes( ); ++iGeneOne ){
00226             for( iGeneTwo = 0; iGeneTwo < vecData[ iDatum ].GetColumns( ); ++iGeneTwo ){
00227                 if( bOne = vecData[ iDatum ].Get( iBaseGenes + iGeneOne, iGeneTwo ) ){
00228                     OpenWrite( bOne - 1, GetOffset( iGeneOne, iGeneTwo, iBaseDatasets + iDatum ), ENibblesLow,
00229                         abImage );
00230                 }
00231             }
00232         }
00233     }
00234     if( fBuffer ) {
00235         m_fstm.seekp( m_iHeader, ios_base::beg );
00236         m_fstm.write( (char*)abImage, iSize );
00237         delete[] abImage;
00238     }
00239 
00240     return true; }
00241 
00242 bool CDatabase::GetGene(const string &strGene, vector<unsigned char> &vecbData) const{
00243     size_t iGene = this->GetGene(strGene);
00244     if(iGene==-1){
00245         cerr << "Gene not found" << endl;
00246         return false;
00247     }
00248     m_vecpDBs[ iGene % m_vecpDBs.size( ) ]->Get( iGene / m_vecpDBs.size( ), vecbData);
00249     return true;
00250 }
00251 
00252 bool CDatabase::GetGene(const size_t &iGene, vector<unsigned char> &vecbData) const{
00253     m_vecpDBs[ iGene % m_vecpDBs.size( ) ]->Get( iGene / m_vecpDBs.size( ), vecbData);
00254     return true;
00255 }
00256 
00257 /* mainly used by SeekMiner */
00258 bool CDatabaselet::Get(size_t iOne, vector<unsigned char>& vecbData){
00259     size_t iSize;
00260     size_t i, j;
00261     size_t offset1 = GetOffset(iOne);
00262     if(this->m_fstm.is_open()){
00263         cerr << "file is already opened" << endl;
00264     }else{
00265         m_fstm.clear( );
00266         m_fstm.open( this->strFileName.c_str( ), ios_base::binary | ios_base::in );
00267     }
00268 
00269     this->m_fstm.seekg(offset1, ios_base::beg);
00270     unsigned char *abImage = (unsigned char*)malloc( iSize = GetSizeGene());
00271     this->m_fstm.read((char*)abImage, iSize);
00272 
00273     m_fstm.close();
00274 
00275     if(m_useNibble==false){
00276         vecbData.clear();
00277         vecbData.resize(iSize);
00278         for(i=0; i<iSize; i++){
00279             vecbData[i] = abImage[i];
00280         }
00281     }else{
00282         vecbData.clear();
00283         vecbData.resize(m_iDatasets*m_iGenes);
00284 
00285         for(j=0; j<m_iGenes; j++){
00286             for(i=0; i<GetSizePair(); i++){
00287                 size_t offset = GetOffset(iOne, j) - m_iHeader;
00288                 unsigned char b = abImage[offset + i];
00289                 unsigned char bValue = -1;
00290                 if( ( bValue = ( b & 0xF ) ) == 0xF ){
00291                     bValue = -1;
00292                 }
00293                 vecbData[ j * m_iDatasets + 2 * i ] = bValue;
00294 
00295                 if( ( bValue = ( ( b >> 4 ) & 0xF ) ) == 0xF ){
00296                     bValue = -1;
00297                 }
00298 
00299                 if((2 * i + 1)==m_iDatasets){
00300                     break;
00301                 }
00302                 vecbData[ j * m_iDatasets + (2 * i) + 1 ] = bValue;
00303             }
00304         }
00305     }
00306     free(abImage);
00307     return true;
00308 }
00309 
00310 bool CDatabaselet::Get( size_t iOne, size_t iTwo,
00311         vector<unsigned char>& vecbData, unsigned char *charImage){
00312     size_t offset = GetOffset(iOne, iTwo) - m_iHeader;
00313     return Get(offset, vecbData, charImage);
00314 }
00315 
00316 bool CDatabaselet::Get(size_t offset, vector<unsigned char>& vecbData, unsigned char *charImage){
00317     size_t  i;
00318 
00319     if(this->m_useNibble==false){
00320         vecbData.clear();
00321         vecbData.resize(GetSizePair());
00322 
00323         for(i=0; i<vecbData.size(); i++){
00324             vecbData[i] = charImage[offset + i];
00325         }
00326     }else{
00327         vecbData.clear();
00328         vecbData.resize(m_iDatasets);
00329 
00330 
00331         for(i=0; i<GetSizePair(); i++){
00332             unsigned char b = charImage[offset + i];
00333             unsigned char bValue = -1;
00334             if( ( bValue = ( b & 0xF ) ) == 0xF ){
00335                 bValue = -1;
00336             }
00337             vecbData[ 2 * i ] = bValue;
00338 
00339             if( ( bValue = ( ( b >> 4 ) & 0xF ) ) == 0xF ){
00340                 bValue = -1;
00341             }
00342 
00343             if((2 * i + 1)==m_iDatasets){
00344                 break;
00345             }
00346             vecbData[ (2 * i) + 1 ] = bValue;
00347         }
00348     }
00349 
00350     return true;
00351 }
00352 
00353 /*  static function, combine multiple databaselets (that share the same genes, ie m_vecStrGenes),
00354     and output result to a single file, or output one-gene per file (if databaselet contains multiple genes)
00355     bSplit: whether or not to output one-gene per file
00356     Works for both nibble and byte
00357 */
00358 bool CDatabaselet::Combine(std::vector<CDatabaselet*>& vecDatabaselet,
00359         std::string strOutDirectory, vector<string> &vecstrGenes, bool bSplit){
00360 
00361     /* for checking on consistency of databaselets */
00362     bool bIsConsistent = true;
00363     bool fUseNibble;
00364 
00365     size_t i, j;
00366     uint32_t iGenes, iDatasets;
00367 
00368     CDatabaselet *first = vecDatabaselet[0];
00369     fUseNibble = first->m_useNibble;
00370 
00371     iGenes = first->GetGenes();
00372     iDatasets = first->GetDatasets();
00373 
00374     vector<string> vecGenes;
00375     vecGenes.resize(iGenes);
00376 
00377     for(i=0; i<iGenes; i++){
00378         vecGenes[i] = first->GetGene(i);
00379     }
00380 
00381     for(i=1; bIsConsistent && i<vecDatabaselet.size(); i++){
00382         if(iGenes!=vecDatabaselet[i]->GetGenes() || fUseNibble!=vecDatabaselet[i]->m_useNibble){
00383             bIsConsistent = false;
00384             break;
00385         }
00386         for(j=0; j<iGenes; j++){
00387             if(vecGenes[j]!=vecDatabaselet[i]->GetGene(j)){
00388                 bIsConsistent = false;
00389                 break;
00390             }
00391         }
00392         iDatasets+=vecDatabaselet[i]->GetDatasets();
00393     }
00394 
00395     if(!bIsConsistent){
00396         cerr << "Databaselets are not consistent!" << endl;
00397         return false;
00398     }
00399 
00400     /* adjustable parameter */
00401     int BUFFER = 80;
00402 
00403     if(BUFFER>iGenes){
00404         BUFFER = iGenes;
00405     }
00406 
00407 
00408     vector<size_t> sizes;
00409     int numTimes = iGenes / BUFFER;
00410     for(i=0; i<numTimes; i++){
00411         sizes.push_back(BUFFER);
00412     }
00413     if(iGenes % BUFFER > 0){
00414         numTimes++;
00415         sizes.push_back(iGenes % BUFFER);
00416     }
00417 
00418     size_t bb = 0;
00419     size_t sofar = 0;
00420     fprintf(stderr, "Number of times: %d\n", numTimes); 
00421     for(bb=0; bb<numTimes; sofar+=sizes[bb], bb++){
00422         fprintf(stderr, "bb sizes[bb] sofar: %d %d %d\n", bb, sizes[bb], sofar); 
00423 
00424         fprintf(stderr, "Started allocated memory %lu\n", bb);
00425         /* load all Databaselets into memory, for efficiency */
00426         unsigned char **charImages =
00427             (unsigned char**)malloc(vecDatabaselet.size()*sizeof(unsigned char*));
00428         size_t totSize = 0;
00429         size_t numPairs = first->m_iGenes * sizes[bb];
00430 
00431         for(i=0; i<vecDatabaselet.size(); i++){
00432             totSize += vecDatabaselet[i]->GetSizePair() * numPairs;
00433         }
00434         charImages[0] = (unsigned char*)malloc(totSize*sizeof(unsigned char));
00435         for(i=1; i<vecDatabaselet.size(); i++){
00436             charImages[i] = charImages[i-1] + vecDatabaselet[i-1]->GetSizePair() * numPairs;
00437         }
00438 
00439         fprintf(stderr, "Finished allocated memory\n");
00440 
00441         fprintf(stderr, "Started reading into memory\n");
00442 
00443         /* read databaselet into charImages */
00444         for(i=0; i<vecDatabaselet.size(); i++){
00445             CDatabaselet *current = vecDatabaselet[i];
00446             if(!current->m_fstm.is_open()){
00447                 cerr << "CDatabaselet is not opened. Opening..." << endl;
00448                 current->m_fstm.open( current->strFileName.c_str( ), ios_base::binary | ios_base::in );
00449             }
00450             current->m_fstm.seekg(current->m_iHeader + current->GetSizePair()*first->m_iGenes*sofar, ios_base::beg);
00451             current->m_fstm.read((char*) charImages[i], current->GetSizePair() * numPairs);
00452         }
00453 
00454         fprintf(stderr, "Finished reading into memory\n");
00455 
00456         //Global gene name mapping
00457         map<string, size_t> mapstrintGenes;
00458         for(i=0; i<vecstrGenes.size(); i++){
00459             mapstrintGenes[vecstrGenes[i]] = i;
00460         }
00461 
00462         char acNumber[16];
00463 
00464         /* splitting to one gene per file after combine */
00465         if(bSplit){
00466 
00467             for(i=0; i<sizes[bb]; i++){
00468                 /* open a new Databaselet containing only one gene */
00469                 string thisGene = first->GetGene(sofar + i);
00470                 size_t iGeneID = mapstrintGenes[thisGene];
00471                 sprintf(acNumber, "%08lu", iGeneID);
00472 
00473                 string path = strOutDirectory + "/" + acNumber + ".db";
00474                 vector<string> vecstrThisGene;
00475                 vecstrThisGene.push_back(thisGene);
00476 
00477                 //fprintf(stderr, "Setting up gene %d in %d\n", i, bb);
00478 
00479                 CDatabaselet DBS(first->m_useNibble);
00480                 DBS.Open(path.c_str(), vecstrThisGene, first->m_iGenes, iDatasets);
00481 
00482                 /* Create a new Databaselet */
00483                 size_t iSize;
00484                 unsigned char *abImage = (unsigned char*)
00485                     malloc( iSize = (DBS.GetSizeGene( ) * DBS.m_vecstrGenes.size( ) ));
00486                 size_t iDatum;
00487                 size_t iGeneOne, iGeneTwo;
00488                 size_t offset2, offset3;
00489                 iGeneOne = i;
00490 
00491                 //fprintf(stderr, "Finished setting up abImage gene %d in %d\n", i, bb);
00492 
00493                 if(first->m_useNibble==false){
00494                     /* m_iGenes is all the genes in the genome */
00495                     for( iGeneTwo = 0; iGeneTwo < first->m_iGenes; ++iGeneTwo ){
00496                         offset2 = DBS.GetSizePair()*iGeneTwo;
00497                         int totalSum = 0;
00498                         for( iDatum = 0; iDatum  < vecDatabaselet.size(); iDatum ++ ){
00499                             vector<unsigned char> vc;
00500                             CDatabaselet *current = vecDatabaselet[iDatum];
00501                             size_t offset_c = current->GetSizeGene() * i + current->GetSizePair() * iGeneTwo;
00502                             current->Get( offset_c, vc, charImages[iDatum]);
00503                             offset3 = offset2 + totalSum;
00504                             for(j=0; j<vc.size(); j++){
00505                                 abImage[offset3 + j] = vc[j];
00506                             }
00507                             totalSum+=vc.size();
00508                         }
00509                     }
00510                 }else{
00511                     size_t j;
00512                     unsigned char *abImage2 = (unsigned char*)malloc(iDatasets);
00513 
00514                     /* m_iGenes is all the genes in the genome */
00515                     for( iGeneTwo = 0; iGeneTwo < first->m_iGenes; ++iGeneTwo ){
00516                         offset2 = DBS.GetSizePair() * iGeneTwo;
00517                         int totalSum = 0;
00518                         for( iDatum = 0; iDatum  < vecDatabaselet.size(); iDatum ++ ){
00519                             vector<unsigned char> vc;
00520                             CDatabaselet *current = vecDatabaselet[iDatum];
00521                             size_t offset_c = current->GetSizeGene() * i + current->GetSizePair() * iGeneTwo;
00522                             current->Get( offset_c, vc, charImages[iDatum]);
00523                             offset3 = totalSum;
00524                             for(j=0; j<vc.size(); j++){
00525                                 abImage2[offset3+j] = vc[j];
00526                             }
00527                             totalSum+=vc.size();
00528                         }
00529                         for(j=0; j+1 < iDatasets; j+=2){
00530                             abImage[offset2 + j / 2] = (abImage2[j] & 0xF) | (abImage2[j+1] << 4);
00531                         }
00532                         if(j<iDatasets){
00533                             unsigned char bValue = abImage2[iDatasets - 1];
00534                             unsigned char b = 255;
00535                             abImage[offset2 + j / 2] = ( bValue & 0xF ) | ( b & 0xF0 );
00536                         }
00537                     }
00538 
00539                     free(abImage2);
00540                 }
00541 
00542                 //fprintf(stderr, "Writing abImage gene %d in %d\n", i, bb);
00543 
00544                 /* close fstream */
00545                 if(!DBS.m_fstm.is_open()){
00546                     cerr << "CDatabaselet is not opened. Opening..." << endl;
00547                     DBS.m_fstm.open(DBS.strFileName.c_str( ), ios_base::binary | ios_base::in);
00548                 }
00549                 DBS.m_fstm.seekp( DBS.m_iHeader, ios_base::beg );
00550                 DBS.m_fstm.write( (char*)abImage, iSize );
00551                 DBS.m_fstm.close();
00552                 free(abImage);
00553 
00554                 fprintf(stderr, "Finished writing abImage gene %lu (of %lu) in %lu (of %lu)\n", i, 
00555                     sizes[bb], bb, sizes.size());
00556 
00557             }
00558 
00559             /* do not split, just combine into one file */
00560         }else{
00561 
00562             vector<string> strTok;
00563             CMeta::Tokenize(first->strFileName.c_str(), strTok, "/");
00564             string path = strOutDirectory + "/" + strTok[strTok.size()-1];
00565 
00566             CDatabaselet DBS(first->m_useNibble);
00567             if(bb==0){
00568                 DBS.Open(path.c_str(), first->m_vecstrGenes, first->m_iGenes, iDatasets);
00569             }else{
00570                 DBS.OpenNoOverwrite();
00571             }
00572 
00573             size_t iDatum;
00574             size_t iSize;
00575             unsigned char *abImage = (unsigned char*)
00576                 malloc( iSize = (DBS.GetSizeGene( ) * sizes[bb] ) );
00577             size_t iGeneOne, iGeneTwo;
00578             size_t offset1, offset2, offset3;
00579 
00580             fprintf(stderr, "GetSizeGene() %d\n", DBS.GetSizeGene());
00581             fprintf(stderr, "GetSizePair() %d\n", DBS.GetSizePair());
00582             fprintf(stderr, "first->m_iGenes %d\n", first->m_iGenes);
00583             if(first->m_useNibble==false){
00584                 for(iGeneOne = 0; iGeneOne < sizes[bb]; ++iGeneOne){
00585                     offset1 = DBS.GetSizeGene() * iGeneOne;
00586                     for( iGeneTwo = 0; iGeneTwo < first->m_iGenes; ++iGeneTwo ){
00587                         offset2 = DBS.GetSizePair()*iGeneTwo;
00588                         int totalSum = 0;
00589                         for( iDatum = 0; iDatum  < vecDatabaselet.size(); iDatum ++ ){
00590                             vector<unsigned char> vc;
00591                             CDatabaselet *current = vecDatabaselet[iDatum];
00592                             //size_t offset_c = current->GetSizeGene() * iGeneOne + current->GetSizePair() * iGeneTwo;
00593                             //current->Get( offset1 + offset2, vc, charImages[iDatum]);
00594                             //current->Get( iGeneOne, iGeneTwo, vc, charImages[iDatum]);
00595                             current->Get(current->GetSizeGene() * iGeneOne + current->GetSizePair() * iGeneTwo, vc, charImages[iDatum]);
00596                             offset3 = offset1 + offset2 + totalSum;
00597                             for(j=0; j<vc.size(); j++){
00598                                 abImage[offset3 + j] = vc[j];
00599                             }
00600                             totalSum+=vc.size();
00601                         }
00602                     }
00603                 }
00604             }else{
00605                 size_t j;
00606                 unsigned char *abImage2 = (unsigned char*)
00607                     malloc(DBS.m_iDatasets);
00608                 /* m_iGenes is all the genes in the genome */
00609                 for(iGeneOne = 0; iGeneOne < sizes[bb]; ++iGeneOne){
00610                     offset1 = DBS.GetSizeGene() * iGeneOne;
00611                     for( iGeneTwo = 0; iGeneTwo < first->m_iGenes; ++iGeneTwo ){
00612                         offset2 = DBS.GetSizePair()*iGeneTwo;
00613                         int totalSum = 0;
00614                         for( iDatum = 0; iDatum  < vecDatabaselet.size(); iDatum ++ ){
00615                             vector<unsigned char> vc;
00616                             CDatabaselet *current = vecDatabaselet[iDatum];
00617                             current->Get(current->GetSizeGene()*iGeneOne + current->GetSizePair()*iGeneTwo, vc, charImages[iDatum]);
00618                             //current->Get( offset1 + offset2, vc, charImages[iDatum]);
00619                             //current->Get( iGeneOne, iGeneTwo, vc, charImages[iDatum]);
00620                             offset3 = totalSum;
00621                             for(j=0; j<vc.size(); j++){
00622                                 abImage2[offset3 + j] = vc[j];
00623                             }
00624                             totalSum+=vc.size();
00625                         }
00626                         for(j=0; j+1 < iDatasets; j+=2){
00627                             abImage[offset1 + offset2 + j / 2] = (abImage2[j] & 0xF) | (abImage2[j+1] << 4);
00628                         }
00629                         if(j<iDatasets){
00630                             unsigned char bValue = abImage2[iDatasets - 1];
00631                             unsigned char b = 255;
00632                             abImage[offset1 + offset2 + j / 2] = ( bValue & 0xF ) | ( b & 0xF0 );
00633                         }
00634                     }
00635                 }
00636                 free(abImage2);
00637             }
00638 
00639             /* close the databaselet */
00640             if(!DBS.m_fstm.is_open()){
00641                 cerr << "CDatabaselet is not opened." << endl;
00642                 DBS.m_fstm.open(DBS.strFileName.c_str( ), ios_base::binary | ios_base::in);
00643             }
00644             //DBS.m_fstm.seekp( DBS.m_iHeader, ios_base::beg );
00645             DBS.m_fstm.seekg(DBS.m_iHeader + DBS.GetSizePair()*first->m_iGenes*sofar, ios_base::beg);
00646             DBS.m_fstm.write( (char*)abImage, iSize );
00647             DBS.m_fstm.close();
00648             free(abImage);
00649         }
00650 
00651         free(charImages[0]);
00652         free(charImages);
00653 
00654     }
00655 
00656     return true;
00657 }
00658 
00659 bool CDatabaselet::Get( size_t iOne, size_t iTwo, vector<unsigned char>& vecbData ) const {
00660     size_t  i;
00661 
00662     i = vecbData.size( );
00663     vecbData.resize( i + GetSizePair( ) );
00664     pthread_mutex_lock( m_pmutx );
00665 
00666     m_fstm.seekg( GetOffset( iOne, iTwo ), ios_base::beg);
00667     m_fstm.read( (char*)&vecbData[ i ], GetSizePair( ) );
00668     pthread_mutex_unlock( m_pmutx );
00669 
00670     return true; }
00671 
00672 bool CDatabaselet::Get( size_t iGene, vector<unsigned char>& vecbData, bool fReplace ) const {
00673     size_t  i;
00674 
00675     i = fReplace ? 0 : vecbData.size( );
00676     vecbData.resize( i + GetSizeGene( ) );
00677     pthread_mutex_lock( m_pmutx );
00678 
00679     m_fstm.seekg( GetOffset( iGene ), ios_base::beg);
00680     m_fstm.read( (char*)&vecbData[ i ], GetSizeGene( ) );
00681     pthread_mutex_unlock( m_pmutx );
00682 
00683     return true; }
00684 
00685 bool CDatabaselet::Get( size_t iGene, const vector<size_t>& veciGenes, vector<unsigned char>& vecbData,
00686     bool fReplace ) const {
00687     size_t  i, iOffset;
00688 
00689     iOffset = fReplace ? 0 : vecbData.size( );
00690     vecbData.resize( iOffset + ( veciGenes.size( ) * GetSizePair( ) ) );
00691     pthread_mutex_lock( m_pmutx );
00692     for( i = 0; i < veciGenes.size( ); ++i,iOffset += GetSizePair( ) ) {
00693         m_fstm.seekg( GetOffset( iGene, veciGenes[ i ] ), ios_base::beg);
00694         m_fstm.read( (char*)&vecbData[ iOffset ], GetSizePair( ) );
00695     }
00696     pthread_mutex_unlock( m_pmutx );
00697 
00698     return true; }
00699 
00700 bool CDatabaselet::Open( const std::string& strFile ) {
00701     uint32_t    iSize;
00702     char*       acBuffer;
00703     char*       pc;
00704     size_t      i;
00705 
00706     m_fstm.clear( );
00707     m_fstm.open( strFile.c_str( ), ios_base::binary | ios_base::in );
00708     if( !m_fstm.is_open( ) ) {
00709         g_CatSleipnir( ).error( "CDatabaselet::Open( %s ) open failed", strFile.c_str( ) );
00710         return false;
00711     }
00712 
00713     m_fstm.read( (char*)&m_iHeader, sizeof(m_iHeader) );
00714     m_fstm.read( (char*)&m_iGenes, sizeof(m_iGenes) );
00715     m_fstm.read( (char*)&m_iDatasets, sizeof(m_iDatasets) );
00716     m_fstm.read( (char*)&iSize, sizeof(iSize) );
00717 
00718     acBuffer = new char[ m_iHeader ];
00719     m_fstm.read( acBuffer, m_iHeader - sizeof(m_iHeader) - sizeof(m_iGenes) - sizeof(m_iDatasets) - sizeof(iSize) );
00720     m_vecstrGenes.resize( iSize );
00721     for( i = 0,pc = acBuffer; i < m_vecstrGenes.size( ); pc += m_vecstrGenes[ i++ ].length( ) + 1 )
00722         m_vecstrGenes[ i ] = pc;
00723     delete[] acBuffer;
00724 
00725     SetFile(strFile);
00726 
00727     return true; }
00728 
00730 // CDatabase
00732 
00733 bool CDatabase::Open( const std::vector<std::string>& vecstrGenes, const std::string& strInputDirectory,
00734     const IBayesNet* pBayesNet, const std::string& strOutputDirectory, size_t iFiles, const map<string, size_t>& mapstriZeros) {
00735     vector<string>  vecstrNodes, vecstrSubset;
00736     size_t          i, j;
00737     char            acNumber[ 16 ];
00738     string          strFile;
00739 
00740     if( !pBayesNet ) {
00741         g_CatSleipnir( ).error( "CDatabase::Open( %s, %d ) null Bayes net", strOutputDirectory.c_str( ), iFiles );
00742         return false; }
00743 
00744     Clear( );
00745     pBayesNet->GetNodes( vecstrNodes );
00746     for( i = 1; i < vecstrNodes.size( ); ++i )
00747         vecstrNodes[ i - 1 ] = strInputDirectory + '/' + vecstrNodes[ i ];
00748 
00749     if( vecstrNodes.size( ) )
00750         vecstrNodes.resize( vecstrNodes.size( ) - 1 );
00751     m_vecpDBs.resize( iFiles );
00752 
00753     int iNumFilesOpen = 1000;
00754     for( i = 0; i < m_vecpDBs.size( ); ++i ) {
00755         m_vecpDBs[ i ] = new CDatabaselet( m_useNibble );
00756     }
00757 
00758     size_t k;
00759     for( i = 0; i < m_vecpDBs.size( ); ++i ) {
00760         if(i%iNumFilesOpen==0 && i>0){
00761             for(k=0; k<iNumFilesOpen; k++){
00762                 m_vecpDBs[i-k-1]->CloseFile();
00763             }
00764         }
00765         vecstrSubset.clear( );
00766         for( j = i; j < vecstrGenes.size( ); j += m_vecpDBs.size( ) )
00767             vecstrSubset.push_back( vecstrGenes[ j ] );
00768 #pragma warning(disable : 4996)
00769         sprintf( acNumber, "%08lu", i );
00770 #pragma warning(default : 4996)
00771         strFile = strOutputDirectory + '/' + acNumber + c_acExtension;
00772         if( !( i % 100 ) )
00773             g_CatSleipnir( ).notice( "CDatabase::Open( %s, %d ) initializing file %d/%d",
00774                 strOutputDirectory.c_str( ), iFiles, i, m_vecpDBs.size( ) );
00775         if( !m_vecpDBs[ i ]->Open( strFile, vecstrSubset, vecstrGenes.size( ), vecstrNodes.size( ) ) ) {
00776             g_CatSleipnir( ).error( "CDatabase::Open( %s, %d ) could not open file %s",
00777                 strOutputDirectory.c_str( ), iFiles, strFile.c_str( ) );
00778             return false; } }
00779 
00780     for( i = 0; i < m_vecpDBs.size( ); ++i ){
00781         m_vecpDBs[i]->CloseFile();
00782     }
00783 
00784     for( i = 0; i < vecstrGenes.size( ); ++i )
00785         m_mapstriGenes[ m_vecpDBs[ i % m_vecpDBs.size( ) ]->GetGene( i / m_vecpDBs.size( ) ) ] = i;
00786 
00787     return CDatabaseImpl::Open( vecstrGenes, vecstrNodes, mapstriZeros );
00788 }
00789 
00790 /* Version of Open() that takes a list of datasets as input. Key method */
00791 bool CDatabase::Open( const std::vector<std::string>& vecstrGenes, const std::vector<std::string>& vecstrDatasets,
00792     const std::string& strInputDirectory, const std::string& strOutputDirectory, size_t iFiles, const map<string, size_t>& mapstriZeros){
00793 
00794     vector<string>  vecstrNodes, vecstrSubset;
00795     size_t          i, j;
00796     char            acNumber[ 16 ];
00797     string          strFile;
00798 
00799     Clear();
00800 
00801     vecstrNodes.resize(vecstrDatasets.size());
00802     for(i=0; i<vecstrDatasets.size(); i++){
00803         vecstrNodes[i] = strInputDirectory + '/' + vecstrDatasets[i];
00804     }
00805 
00806     m_vecpDBs.resize( iFiles );
00807     int iNumFilesOpen = 1000;
00808 
00809     for( i = 0; i < m_vecpDBs.size( ); ++i ) {
00810         m_vecpDBs[ i ] = new CDatabaselet( m_useNibble );
00811     }
00812 
00813     size_t k;
00814 
00815     for( i = 0; i < m_vecpDBs.size( ); ++i ) { //block size (such as 1000)
00816         if(i%iNumFilesOpen==0 && i>0){
00817             for(k=0; k<iNumFilesOpen; k++){
00818                 m_vecpDBs[i-k-1]->CloseFile();
00819             }
00820         }
00821         vecstrSubset.clear( );
00822         for( j = i; j < vecstrGenes.size( ); j += m_vecpDBs.size( ) )
00823             vecstrSubset.push_back( vecstrGenes[ j ] ); //contains index for 1000, 2000, 3000th genes
00824         sprintf( acNumber, "%08lu", i );
00825         strFile = strOutputDirectory + '/' + acNumber + c_acExtension;
00826 
00827         if( !( i % 100 ) )
00828             g_CatSleipnir( ).notice( "CDatabase::Open( %s, %d ) initializing file %d/%d",
00829                 strOutputDirectory.c_str( ), iFiles, i, m_vecpDBs.size( ) );
00830         if( !m_vecpDBs[ i ]->Open( strFile, vecstrSubset, vecstrGenes.size( ), vecstrNodes.size( ) ) ) {
00831             g_CatSleipnir( ).error( "CDatabase::Open( %s, %d ) could not open file %s",
00832                 strOutputDirectory.c_str( ), iFiles, strFile.c_str( ) );
00833             return false; } }
00834 
00835     for( i = 0; i < m_vecpDBs.size( ); ++i ){
00836         m_vecpDBs[i]->CloseFile();
00837     }
00838 
00839     for( i = 0; i < vecstrGenes.size( ); ++i )
00840         m_mapstriGenes[ m_vecpDBs[ i % m_vecpDBs.size( ) ]->GetGene( i / m_vecpDBs.size( ) ) ] = i;
00841 
00842     return CDatabaseImpl::Open( vecstrGenes, vecstrNodes, mapstriZeros ); 
00843 }
00844 
00845 /* the key Open() method for Data2DB conversion */
00846 bool CDatabaseImpl::Open( const std::vector<std::string>& vecstrGenes,
00847     const std::vector<std::string>& vecstrFiles, const map<string, size_t>& mapstriZeros ) {
00848     size_t          i, j, k, iOne, iTwo, iOutBlock, iOutBase, iOutOffset, iInBlock, iInBase, iInOffset;
00849     vector<size_t>  veciGenes;
00850     float           d;
00851     map<string, size_t>::const_iterator iterZero;
00852 
00853     veciGenes.resize( vecstrGenes.size( ) );
00854     iOutBlock = ( m_iBlockOut == -1 ) ? m_vecpDBs.size( ) : m_iBlockOut;
00855     iInBlock = ( m_iBlockIn == -1 ) ? vecstrFiles.size( ) : m_iBlockIn;
00856 
00857     int iNumFilesOpen = 1000;
00858 
00859     /* blocking parameter, iOutBase: number of databaselets to process at a time */
00860     for( iOutBase = 0; iOutBase < m_vecpDBs.size( ); iOutBase += iOutBlock ) {
00861         vector<string>  vecstrMyGenes;
00862         vector<size_t>  veciMyGenes;
00863 
00864         for( iOutOffset = 0; ( iOutOffset < iOutBlock ) && ( ( iOutBase + iOutOffset ) < m_vecpDBs.size( ) );
00865             ++iOutOffset ) {
00866             const CDatabaselet& DB  = *m_vecpDBs[ iOutBase + iOutOffset ];
00867 
00868             for( i = 0; i < DB.GetGenes( ); ++i )
00869                 vecstrMyGenes.push_back( DB.GetGene( i ) );
00870         }
00871 
00872         veciMyGenes.resize( vecstrMyGenes.size( ) );
00873 
00874         for( iInBase = 0; iInBase < vecstrFiles.size( ); iInBase += iInBlock ) {
00875             vector<CCompactFullMatrix>  vecData;
00876             vecData.resize( ( ( iInBase + iInBlock ) > vecstrFiles.size( ) ) ?
00877                 ( vecstrFiles.size( ) - iInBase ) : iInBlock );
00878             for( iInOffset = 0; iInOffset < vecData.size( ); ++iInOffset ) {
00879                 CDataPair   Dat;
00880 
00881                 if( !Dat.Open( (vecstrFiles[ iInBase + iInOffset ] + c_acDAB).c_str( ),
00882                         false, m_fMemmap) ) {
00883                     if( !Dat.Open( (vecstrFiles[ iInBase + iInOffset ] + c_acQDAB).c_str( ), false, m_fMemmap ) ) {
00884                         g_CatSleipnir( ).error( "CDatabaseImpl::Open( ) could not open %s",
00885                             (vecstrFiles[ iInBase + iInOffset ] + c_acDAB).c_str( ) );
00886                         g_CatSleipnir( ).error( "CDatabaseImpl::Open( ) could not open %s",
00887                             (vecstrFiles[ iInBase + iInOffset ] + c_acQDAB).c_str( ) );
00888                         return false;
00889                     }
00890                 }
00891 
00892                 for( i = 0; i < veciMyGenes.size( ); ++i )
00893                     veciMyGenes[ i ] = Dat.GetGene( vecstrMyGenes[ i ] );
00894                 for( i = 0; i < veciGenes.size( ); ++i )
00895                     veciGenes[ i ] = Dat.GetGene( vecstrGenes[ i ] );
00896 
00897                 if(m_useNibble){
00898                     vecData[ iInOffset ].Initialize( veciMyGenes.size( ), veciGenes.size( ), 16, true );
00899                 }else{
00900                     vecData[ iInOffset ].Initialize( veciMyGenes.size( ), veciGenes.size( ), (unsigned char) 256, true );
00901                 }
00902 
00903                 string  strName = CMeta::Filename( CMeta::Deextension( CMeta::Basename( vecstrFiles[ iInBase + iInOffset ].c_str() ) ) );
00904                 size_t iZero = ( ( iterZero = mapstriZeros.find( strName ) ) == mapstriZeros.end( ) ) ? -1 : iterZero->second;
00905                 //#pragma omp parallel for \
00906                 shared(Dat, veciGenes, veciMyGenes, vecData) \
00907                 private(j, i) \
00908                 schedule(static)
00909 
00910                 for(i=0; i<veciMyGenes.size(); i++){
00911                     size_t iOne = veciMyGenes[i];
00912                     for(j=0; j<veciGenes.size(); j++){
00913                         size_t iTwo = veciGenes[j];
00914                         size_t iVal = -1;
00915                         if ( iOne != -1 && iTwo != -1 ) {
00916                             if( !CMeta::IsNaN( d = Dat.Get( iOne, iTwo ) ) )
00917                             iVal = Dat.Quantize(d); 
00918                             if ( iVal == -1 )
00919                             iVal = iZero;
00920                         }
00921                         if ( iVal != -1 )
00922                             vecData[iInOffset].Set(i,j,iVal+1); 
00923                     }
00924                 }
00925 
00926 
00927             }
00928 
00929             for( i = iOutOffset = 0; ( iOutOffset < iOutBlock ) && ( ( iOutBase + iOutOffset ) <
00930                 m_vecpDBs.size( ) ); ++iOutOffset ) {
00931                 CDatabaselet&   DB  = *m_vecpDBs[ iOutBase + iOutOffset ];
00932 
00933                 /* close files if too many file handles opened */
00934                 /*if(iOutOffset>0 && (iOutOffset % iNumFilesOpen==0 ||
00935                     (iOutOffset + 1) >= iOutBlock || (iOutBase + iOutOffset + 1) >= m_vecpDBs.size() ) ){
00936                     for(k=0; k<iNumFilesOpen; k++){
00937                         if(iOutOffset + iOutBase - k - 1 < 0){
00938                             break;
00939                         }
00940                         m_vecpDBs[iOutOffset + iOutBase - 1 - k]->CloseFile();
00941                     }
00942                 }*/
00943 
00944                 DB.OpenNoOverwrite();
00945 
00946                 if( !( iOutOffset % 100 ) )
00947                     cerr << "Processing offset " << iOutOffset << '/' << iOutBlock << endl;
00948                 //iInBase, for B=2, iInBase = 0, 2, 4 (total 5 datasets)
00949                 //i = 0 18 36 ...
00950                 if( !DB.Open( vecData, i, iInBase, m_fBuffer ) ){
00951                     return false;
00952                 }
00953 
00954                 i += DB.GetGenes( );
00955 
00956                 DB.CloseFile();
00957             }
00958         }
00959     }
00960 
00961     return true;
00962 }
00963 
00964 bool CDatabase::Open( const std::string& strInputDirectory ) {
00965     size_t          i, j;
00966     vector<string>  vecstrFiles;
00967     string          strFile;
00968 
00969     FOR_EACH_DIRECTORY_FILE(strInputDirectory, strFile)
00970         if( !CMeta::IsExtension( strFile, c_acExtension ) )
00971             continue;
00972 
00973         i = atoi( CMeta::Deextension( CMeta::Basename( strFile.c_str( ) ) ).c_str( ) );
00974         if( vecstrFiles.size( ) <= i )
00975             vecstrFiles.resize( i + 1 );
00976         else if( vecstrFiles[ i ].length( ) != 0 ) {
00977             g_CatSleipnir( ).error( "CDatabase::Open( %s ) duplicate file: %s (%d)", strInputDirectory.c_str( ),
00978                 strFile.c_str( ), i );
00979             return false; }
00980         vecstrFiles[ i ] = strInputDirectory + '/' + strFile; }
00981 
00982     Clear( );
00983     m_vecpDBs.resize( vecstrFiles.size( ) );
00984     for( i = 0; i < m_vecpDBs.size( ); ++i ) {
00985         m_vecpDBs[ i ] = new CDatabaselet( m_useNibble);
00986         if( !m_vecpDBs[ i ]->Open( vecstrFiles[ i ] ) )
00987             return false;
00988         for( j = 0; j < m_vecpDBs[ i ]->GetGenes( ); ++j )
00989             m_mapstriGenes[ m_vecpDBs[ i ]->GetGene( j ) ] = ( j * m_vecpDBs.size( ) ) + i; }
00990 
00991     return true;
00992 }
00993 
00994 bool CDatabaselet::Set(uint32_t &iGenes, uint32_t &iDatasets, vector<string> &vecstrSubsetGenes){
00995     size_t i;
00996     m_vecstrGenes.clear();
00997     m_vecstrGenes.resize(vecstrSubsetGenes.size());
00998     for(i=0; i<vecstrSubsetGenes.size(); i++){
00999         m_vecstrGenes[i] = vecstrSubsetGenes[i];
01000     }
01001 
01002     m_iGenes = iGenes;
01003     m_iDatasets = iDatasets;
01004     uint32_t iSize = m_vecstrGenes.size( );
01005     m_iHeader = sizeof(m_iHeader) + sizeof(m_iGenes) + sizeof(m_iDatasets) + sizeof(iSize);
01006     for( i = 0; i < m_vecstrGenes.size( ); ++i ) {
01007         m_iHeader += m_vecstrGenes[ i ].size( ) + 1;
01008     }
01009 
01010     return true;
01011 }
01012 
01013 //Create a copy of current CDatabase collection that has X number of CDatabaselets 
01014 bool CDatabase::Reorganize(const char *dest_db_dir, const size_t &num_db){
01015     int dest_db = num_db;
01016     size_t i, j, l;
01017     const char c_acExtension[] = ".db";
01018     char acNumber[16];
01019 
01020     vector<string> vecstrG;
01021     vecstrG.resize(m_mapstriGenes.size());
01022 
01023     for(map<string,size_t>::iterator iter=m_mapstriGenes.begin();
01024         iter!=m_mapstriGenes.end(); iter++){
01025         string first = iter->first;
01026         size_t second = iter->second;
01027         vecstrG[second] = first;
01028     }
01029 
01030     for(i=0; i<dest_db; i++){
01031         vector<string> vecstrSubset;
01032         for(j=i; j<vecstrG.size(); j+=dest_db)
01033             vecstrSubset.push_back(vecstrG[j]);
01034         //size of this db
01035         size_t iSize = GetGenes() * GetDatasets() * vecstrSubset.size();
01036 
01037         unsigned char *Qt = (unsigned char*)malloc(iSize);
01038         int tot = 0;
01039         for(j=0; j<vecstrSubset.size(); j++){
01040             size_t k = m_mapstriGenes.find(vecstrSubset[j])->second;
01041             vector<unsigned char> Qi;
01042             if(!GetGene(k, Qi)){
01043                 cerr << "Gene error" << endl;
01044                 continue;
01045             }
01046             for(l=0; l<Qi.size(); l++)
01047                 Qt[tot+l] = Qi[l];
01048             tot+=Qi.size();
01049         }
01050         sprintf(acNumber, "%08lu", i);
01051         string dest_dir = dest_db_dir;
01052         string strFile = dest_dir + "/" + acNumber + c_acExtension;
01053 
01054         CDatabaselet DBS(false);
01055         DBS.Open(strFile.c_str(), vecstrSubset, vecstrG.size(), 
01056             GetDatasets());
01057         DBS.Write((char*) Qt, iSize, 0);
01058         free(Qt);
01059     }
01060     return true;
01061 }
01062 
01063 //For SeekMiner
01064 bool CDatabase::Open(const string &strDBDirectory,
01065         const vector<string> &vecstrGenes, const size_t &iDatasets, const size_t &iNumDBs){
01066     return CDatabase::Open(strDBDirectory.c_str(), vecstrGenes, iDatasets, iNumDBs);
01067 }
01068 
01069 //For SeekMiner
01070 bool CDatabase::Open(const char *db_dir, const vector<string> &vecstrGenes, 
01071     const size_t &iDatasets, const size_t &iNumDBs){
01072     size_t i, j, k;
01073     Clear();
01074     m_vecpDBs.resize(iNumDBs);
01075     char acNumber[ 16 ];
01076 
01077     for( i = 0; i < m_vecpDBs.size( ); ++i )
01078         m_vecpDBs[ i ] = new CDatabaselet( m_useNibble );
01079 
01080     string strDBDirectory = db_dir;
01081 
01082     for(i=0; i<iNumDBs; i++){
01083         vector<string> vecstrSubset;
01084         vecstrSubset.clear( );
01085         for( j = i; j < vecstrGenes.size( ); j += m_vecpDBs.size( ) )
01086             vecstrSubset.push_back( vecstrGenes[ j ] ); //contains index for 1000, 2000, 3000th genes
01087         sprintf( acNumber, "%08lu", i );
01088         string strFile = strDBDirectory + '/' + acNumber + c_acExtension;
01089         uint32_t iGenes = vecstrGenes.size();
01090         uint32_t iDset = iDatasets;
01091         m_vecpDBs[i]->Set(iGenes, iDset, vecstrSubset);
01092         m_vecpDBs[i]->SetFile(strFile);
01093     }
01094 
01095     for( i = 0; i < vecstrGenes.size( ); ++i ){
01096         string s = m_vecpDBs[ i % m_vecpDBs.size( ) ]->GetGene( i / m_vecpDBs.size( ) );
01097         string file = m_vecpDBs[i % m_vecpDBs.size()]->GetFile();
01098         //fprintf(stderr, "%s\t%s\t%d\n", s.c_str(), file.c_str(), i);
01099         m_mapstriGenes[ m_vecpDBs[ i % m_vecpDBs.size( ) ]->GetGene( i / m_vecpDBs.size( ) ) ] = i;
01100     }
01101     //fprintf(stderr, "Done\n"); getchar();
01102 
01103     return true;
01104 }
01105 
01106 }