Sleipnir
tools/Data2DB/Data2DB.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 
00026 int main( int iArgs, char** aszArgs ) {
00027     static const size_t c_iBuffer   = 1024;
00028 #ifdef WIN32
00029     pthread_win32_process_attach_np( );
00030 #endif // WIN32
00031     gengetopt_args_info sArgs;
00032     ifstream            ifsm;
00033     istream*            pistm;
00034     vector<string>      vecstrLine, vecstrGenes, vecstrDatasets;
00035     char                acBuffer[ c_iBuffer ];
00036     CBayesNetSmile      BNSmile;
00037     size_t              i;
00038     map<string, size_t> mapstriZeros;
00039 
00040     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00041         cmdline_parser_print_help( );
00042         return 1; }
00043     CMeta Meta( sArgs.verbosity_arg );
00044 
00045     if( sArgs.input_arg ) {
00046         ifsm.open( sArgs.input_arg );
00047         pistm = &ifsm; }
00048     else
00049         pistm = &cin;
00050     while( !pistm->eof( ) ) {
00051         pistm->getline( acBuffer, c_iBuffer - 1 );
00052         acBuffer[ c_iBuffer - 1 ] = 0;
00053         vecstrLine.clear( );
00054         CMeta::Tokenize( acBuffer, vecstrLine );
00055         if( vecstrLine.size( ) < 2 ) {
00056             cerr << "Ignoring line: " << acBuffer << endl;
00057             continue; }
00058         if( !( i = atoi( vecstrLine[ 0 ].c_str( ) ) ) ) {
00059             cerr << "Illegal gene ID: " << vecstrLine[ 0 ] << " for " << vecstrLine[ 1 ] << endl;
00060             return 1; }
00061         i--;
00062         if( vecstrGenes.size( ) <= i )
00063             vecstrGenes.resize( i + 1 );
00064         vecstrGenes[ i ] = vecstrLine[ 1 ]; }
00065     if( sArgs.input_arg )
00066         ifsm.close( );
00067 
00068     if( sArgs.zeros_arg ) {
00069         ifstream        ifsm_zero;
00070         vector<string>  vecstrLine;
00071         char            acLine[ 1024 ];
00072 
00073         ifsm_zero.open( sArgs.zeros_arg );
00074         if( !ifsm_zero.is_open( ) ) {
00075             cerr << "Couldn't open: " << sArgs.zeros_arg << endl;
00076             return 1;
00077         }
00078         while( !ifsm_zero.eof( ) ) {
00079             ifsm_zero.getline( acLine, ARRAYSIZE(acLine) - 1 );
00080             acLine[ ARRAYSIZE(acLine) - 1 ] = 0;
00081             vecstrLine.clear( );
00082             CMeta::Tokenize( acLine, vecstrLine );
00083             if( vecstrLine.empty( ) )
00084             continue;
00085             mapstriZeros[ vecstrLine[ 0 ] ] = atoi( vecstrLine[ 1 ].c_str( ) );
00086         }
00087     }
00088 
00089 
00090     bool useNibble = false;
00091     if(sArgs.use_nibble_flag==1){
00092         useNibble = true;
00093     }
00094 
00095     CDatabase DB(useNibble);
00096     DB.SetMemmap( !!sArgs.memmap_flag );
00097     DB.SetBuffer( !!sArgs.buffer_flag );
00098     DB.SetBlockOut( sArgs.block_files_arg );
00099     DB.SetBlockIn( sArgs.block_datasets_arg );
00100 
00101     if(sArgs.network_arg){
00102         if(sArgs.dataset_arg){
00103             cerr << "Confused. Only network OR dataset list." << endl;
00104             return 1;
00105         }
00106 
00107         if( !BNSmile.Open( sArgs.network_arg ) ) {
00108             cerr << "Could not open: " << sArgs.network_arg << endl;
00109             return 1; }
00110         if( !DB.Open( vecstrGenes, sArgs.dir_in_arg, &BNSmile, sArgs.dir_out_arg, min((size_t)sArgs.files_arg,
00111             vecstrGenes.size( )), mapstriZeros ) ) {
00112             cerr << "Could not open data" << endl;
00113             return 1;
00114         }
00115 
00116     }else if(sArgs.dataset_arg){
00117 
00118         ifsm.open(sArgs.dataset_arg);
00119         while(!pistm->eof()){
00120             pistm->getline(acBuffer, c_iBuffer -1);
00121             if(acBuffer[0]==0)
00122                 break;
00123             acBuffer[c_iBuffer-1] = 0;
00124             //If line contains multiple columns,
00125             //use the first column, which is the dataset column
00126             vector<string> tok;
00127             CMeta::Tokenize(acBuffer, tok, " \t");
00128             vecstrDatasets.push_back(tok[0]);
00129         }
00130         vecstrDatasets.resize(vecstrDatasets.size());
00131         ifsm.close();
00132 
00133         if( !DB.Open( vecstrGenes, vecstrDatasets, sArgs.dir_in_arg, sArgs.dir_out_arg, min((size_t)sArgs.files_arg,
00134             vecstrGenes.size( )), mapstriZeros ) ) {
00135             cerr << "Could not open data" << endl;
00136             return 1;
00137         }
00138 
00139     }else{
00140         cerr << "Must give a network or a dataset list." << endl;
00141         return 1;
00142 
00143     }
00144 
00145 #ifdef WIN32
00146     pthread_win32_process_detach_np( );
00147 #endif // WIN32
00148     return 0; }