Sleipnir
tools/DBCombiner/DBCombiner.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 
00026 int main( int iArgs, char** aszArgs ) {
00027     static const size_t c_iBuffer   = 1024;
00028 #ifdef WIN32
00029     pthread_win32_process_attach_np( );
00030 #endif // WIN32
00031     gengetopt_args_info sArgs;
00032     ifstream            ifsm;
00033     istream*            pistm;
00034     vector<string>      vecstrLine, vecstrGenes, vecstrDBs;
00035     char                acBuffer[ c_iBuffer ];
00036     size_t              i;
00037 
00038     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00039         cmdline_parser_print_help( );
00040         return 1; }
00041 
00042     if( sArgs.input_arg ) {
00043         ifsm.open( sArgs.input_arg );
00044         pistm = &ifsm; }
00045     else
00046         pistm = &cin;
00047     while( !pistm->eof( ) ) {
00048         pistm->getline( acBuffer, c_iBuffer - 1 );
00049         acBuffer[ c_iBuffer - 1 ] = 0;
00050         vecstrLine.clear( );
00051         CMeta::Tokenize( acBuffer, vecstrLine );
00052         if( vecstrLine.size( ) < 2 ) {
00053             cerr << "Ignoring line: " << acBuffer << endl;
00054             continue; }
00055         if( !( i = atoi( vecstrLine[ 0 ].c_str( ) ) ) ) {
00056             cerr << "Illegal gene ID: " << vecstrLine[ 0 ] <<
00057                 " for " << vecstrLine[ 1 ] << endl;
00058             return 1; }
00059         i--;
00060         if( vecstrGenes.size( ) <= i )
00061             vecstrGenes.resize( i + 1 );
00062         vecstrGenes[ i ] = vecstrLine[ 1 ]; }
00063     if( sArgs.input_arg )
00064         ifsm.close( );
00065 
00066     bool useNibble = false;
00067     if(sArgs.is_nibble_flag==1){
00068         useNibble = true;
00069     }
00070 
00071     if(sArgs.reorganize_flag==1){
00072         vector<string> vecstrDataset;
00073         ifstream ifsm2;
00074         ifsm2.open(sArgs.dataset_arg);
00075         while(!ifsm2.eof()){
00076             ifsm2.getline(acBuffer, c_iBuffer-1);
00077             if(acBuffer[0]==0) break;
00078             acBuffer[c_iBuffer-1] = 0;
00079             vector<string> vecstrLine;
00080             CMeta::Tokenize(acBuffer, vecstrLine);
00081             vecstrDataset.push_back(vecstrLine[0]);
00082         }
00083         ifsm2.close();
00084 
00085         if(useNibble){
00086             fprintf(stderr, "The use of nibble flag is not supported for --reorganize mode\n");
00087             return 1;
00088         }
00089         CDatabase db(false);
00090         db.Open(sArgs.db_dir_arg, vecstrGenes, vecstrDataset.size(), 
00091             sArgs.src_db_num_arg);
00092         db.Reorganize(sArgs.dest_db_dir_arg, sArgs.dest_db_num_arg);
00093         return 0;
00094     }
00095 
00096     if(sArgs.combine_flag==1){
00097         CDatabase DB(useNibble);
00098 
00099         bool fSplit = false;
00100         if(sArgs.split_flag==1){
00101             fSplit = true;
00102         }
00103 
00104         if(sArgs.db_arg){
00105             ifsm.open(sArgs.db_arg);
00106             while(!pistm->eof()){
00107                 pistm->getline(acBuffer, c_iBuffer -1);
00108                 if(acBuffer[0]==0){
00109                     break;
00110                 }
00111                 acBuffer[c_iBuffer-1] = 0;
00112                 vecstrDBs.push_back(acBuffer);
00113             }
00114             vecstrDBs.resize(vecstrDBs.size());
00115             ifsm.close();
00116 
00117             //printf("Reading DBS"); getchar();
00118             vector<CDatabaselet*> DBS;
00119             DBS.resize(vecstrDBs.size());
00120             for(i=0; i<vecstrDBs.size(); i++){
00121                 DBS[i] = new CDatabaselet(useNibble);
00122                 DBS[i]->Open(vecstrDBs[i]);
00123             }
00124             //printf("Finished reading DBS"); getchar();
00125 
00126             CDatabaselet::Combine(DBS, sArgs.dir_out_arg, vecstrGenes, fSplit);
00127             for(i=0; i<vecstrDBs.size(); i++){
00128                 free(DBS[i]);
00129             }
00130 
00131         }else{
00132             cerr << "Must give a db list." << endl;
00133             return 1;
00134 
00135         }
00136     }
00137 #ifdef WIN32
00138     pthread_win32_process_detach_np( );
00139 #endif // WIN32
00140     return 0; }