Sleipnir
tools/Dab2Dad/Dab2Dad.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 int main( int iArgs, char** aszArgs ) {
00026     gengetopt_args_info sArgs;
00027     CDatasetCompact     Data;
00028     CDataset            DataContinuous;
00029     IDataset*           pData;
00030     CGenome             Genome;
00031     CGenes              GenesIn( Genome ), GenesEx( Genome );
00032     ifstream            ifsm;
00033     size_t              i, j, k, iOne, iTwo;
00034     float               d;
00035 
00036     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00037         cmdline_parser_print_help( );
00038         return 1; }
00039     CMeta Meta( sArgs.verbosity_arg );
00040 
00041     if( sArgs.load_arg ) {
00042         CDatasetCompactMap  DataMap;
00043 
00044         if( !DataMap.Open( sArgs.load_arg ) ) {
00045             cerr << "Couldn't open: " << sArgs.load_arg << endl;
00046             return 1; }
00047 #ifdef _MSC_VER
00048         Sleep( INFINITE );
00049 #else // _MSC_VER
00050         sleep( -1 );
00051 #endif // _MSC_VER
00052         return 0; }
00053 
00054     if( sArgs.genes_arg ) {
00055         ifsm.open( sArgs.genes_arg );
00056         if( !GenesIn.Open( ifsm ) ) {
00057             cerr << "Couldn't open: " << sArgs.genes_arg << endl;
00058             return 1; }
00059         ifsm.close( ); }
00060     if( sArgs.genex_arg ) {
00061         ifsm.clear( );
00062         ifsm.open( sArgs.genex_arg );
00063         if( !GenesEx.Open( ifsm ) ) {
00064             cerr << "Couldn't open: " << sArgs.genex_arg << endl;
00065             return 1; }
00066         ifsm.close( ); }
00067 
00068     if( sArgs.input_arg ) {
00069         ifsm.clear( );
00070         ifsm.open( sArgs.input_arg, ios_base::binary );
00071         if( !Data.Open( ifsm ) ) {
00072             cerr << "Couldn't open: " << sArgs.input_arg << endl;
00073             return 1; }
00074         ifsm.close( ); }
00075     else if( sArgs.network_arg ) {
00076         CDataPair       Answers;
00077         CBayesNetSmile  BNSmile;
00078 
00079         if( !sArgs.answers_arg ) {
00080             cmdline_parser_print_help( );
00081             return 1; }
00082         if( !BNSmile.Open( sArgs.network_arg ) ) {
00083             cerr << "Couldn't open: " << sArgs.network_arg << endl;
00084             return 1; }
00085         if( !Answers.Open( sArgs.answers_arg, false ) ) {
00086             cerr << "Couldn't open: " << sArgs.answers_arg << endl;
00087             return 1; }
00088         if( !Data.Open( Answers, sArgs.directory_arg, &BNSmile, GenesIn, GenesEx, !!sArgs.everything_flag ) ) {
00089             cerr << "Couldn't open: " << sArgs.directory_arg << endl;
00090             return 1; } }
00091     else if( sArgs.lookupp_arg ) {
00092         CDat            DatLookup;
00093         vector<size_t>  veciGenes;
00094         CPCL            PCLLookup;
00095         vector<string>  vecstrNames, vecstrExperiments, vecstrDummy;
00096 
00097         ifsm.clear( );
00098         ifsm.open( sArgs.lookupp_arg );
00099         if( !DatLookup.Open( ifsm, CDat::EFormatText, 1 ) ) {
00100             cerr << "Couldn't open: " << sArgs.lookupp_arg << endl;
00101             return 1; }
00102         ifsm.close( );
00103 
00104         for( i = 0; i < DatLookup.GetGenes( ); ++i )
00105             for( j = ( i + 1 ); j < DatLookup.GetGenes( ); ++j )
00106                 if( !CMeta::IsNaN( DatLookup.Get( i, j ) ) )
00107                     vecstrNames.push_back( DatLookup.GetGene( i ) + " - " + DatLookup.GetGene( j ) );
00108         vecstrExperiments.resize( sArgs.inputs_num );
00109         copy( sArgs.inputs, sArgs.inputs + sArgs.inputs_num, vecstrExperiments.begin( ) );
00110         PCLLookup.Open( vecstrNames, vecstrExperiments, vecstrDummy );
00111 
00112         veciGenes.resize( DatLookup.GetGenes( ) );
00113         for( i = 0; i < sArgs.inputs_num; ++i ) {
00114             CDataPair   Dat;
00115             size_t      iGene;
00116 
00117             if( !Dat.Open( sArgs.inputs[ i ], false, !!sArgs.memmap_flag ) ) {
00118                 cerr << "Couldn't open: " << sArgs.inputs[ i ] << endl;
00119                 return 1; }
00120             for( j = 0; j < veciGenes.size( ); ++j )
00121                 veciGenes[ j ] = Dat.GetGene( DatLookup.GetGene( j ) );
00122             for( iGene = j = 0; j < DatLookup.GetGenes( ); ++j ) {
00123                 iOne = veciGenes[ j ];
00124                 for( k = ( j + 1 ); k < DatLookup.GetGenes( ); ++k ) {
00125                     if( CMeta::IsNaN( DatLookup.Get( j, k ) ) )
00126                         continue;
00127                     iTwo = veciGenes[ k ];
00128                     if( ( iOne != -1 ) && ( iTwo != -1 ) )
00129                         PCLLookup.Set( iGene, i, Dat.Get( iOne, iTwo ) );
00130                     iGene++; } } }
00131         PCLLookup.Save( cout );
00132         return 0; }
00133     else if( sArgs.lookups_arg ) {
00134         CGenes          GenesLk( Genome );
00135         vector<size_t>  veciGenes;
00136 
00137         ifsm.clear( );
00138         ifsm.open( sArgs.lookups_arg );
00139         if( !GenesLk.Open( ifsm ) ) {
00140             cerr << "Couldn't open: " << sArgs.lookups_arg << endl;
00141             return 1; }
00142         ifsm.close( );
00143 
00144         cout << "GID";
00145         if( sArgs.lookup1_arg )
00146             for( i = 0; i < GenesLk.GetGenes( ); ++i )
00147                 cout << '\t' << GenesLk.GetGene( i ).GetName( );
00148         else
00149             for( i = 0; i < GenesLk.GetGenes( ); ++i ) {
00150                 const string&   strOne  = GenesLk.GetGene( i ).GetName( );
00151                 for( j = ( i + 1 ); j < GenesLk.GetGenes( ); ++j )
00152                     cout << '\t' << strOne << '-' << GenesLk.GetGene( j ).GetName( ); }
00153         cout << endl;
00154 
00155         veciGenes.resize( GenesLk.GetGenes( ) );
00156         for( i = 0; i < sArgs.inputs_num; ++i ) {
00157             CDataPair   Dat;
00158 
00159             if( !Dat.Open( sArgs.inputs[ i ], false, !!sArgs.memmap_flag ) ) {
00160                 cerr << "Couldn't open: " << sArgs.inputs[ i ] << endl;
00161                 return 1; }
00162             for( j = 0; j < GenesLk.GetGenes( ); ++j )
00163                 veciGenes[ j ] = Dat.GetGene( GenesLk.GetGene( j ).GetName( ) );
00164             cout << sArgs.inputs[ i ];
00165             if( sArgs.lookup1_arg ) {
00166                 if( ( iOne = Dat.GetGene( sArgs.lookup1_arg ) ) != -1 )
00167                     for( j = 0; j < veciGenes.size( ); ++j ) {
00168                         cout << '\t';
00169                         if( ( ( iTwo = veciGenes[ j ] ) != -1 ) &&
00170                             !CMeta::IsNaN( d = Dat.Get( iOne, iTwo ) ) )
00171                             cout << d; } }
00172             else
00173                 for( j = 0; j < veciGenes.size( ); ++j ) {
00174                     iOne = veciGenes[ j ];
00175                     for( k = ( j + 1 ); k < veciGenes.size( ); ++k ) {
00176                         cout << '\t';
00177                         if( ( iOne != -1 ) && ( ( iTwo = veciGenes[ k ] ) != -1 ) &&
00178                             !CMeta::IsNaN( d = Dat.Get( iOne, iTwo ) ) )
00179                             cout << Dat.Get( iOne, iTwo ); } }
00180             cout << endl; }
00181         return 0; }
00182     else if( sArgs.lookup1_arg && sArgs.lookup2_arg ) {
00183         for( i = 0; i < sArgs.inputs_num; ++i ) {
00184             CDataPair   Dat;
00185 
00186             if( !Dat.Open( sArgs.inputs[ i ], false, !!sArgs.memmap_flag ) ) {
00187                 cerr << "Couldn't open: " << sArgs.inputs[ i ] << endl;
00188                 return 1; }
00189             cout << sArgs.inputs[ i ];
00190             if( ( ( iOne = Dat.GetGene( sArgs.lookup1_arg ) ) != -1 ) &&
00191                 ( ( iTwo = Dat.GetGene( sArgs.lookup2_arg ) ) != -1 ) &&
00192                 !CMeta::IsNaN( d = Dat.Get( iOne, iTwo ) ) )
00193                 cout << '\t' << ( sArgs.quantize_flag ? Dat.Quantize( d ) : d );
00194             cout << endl; }
00195         return 0; }
00196     else if( sArgs.paircount_arg != -1 ) {
00197         CHalfMatrix<size_t> DatCounts;
00198 
00199         if( !GenesIn.GetGenes( ) ) {
00200             cerr << "Pair count requires gene list -g" << endl;
00201             return 1; }
00202         DatCounts.Initialize( GenesIn.GetGenes( ) );
00203         DatCounts.Clear( );
00204         for( i = 0; i < sArgs.inputs_num; ++i ) {
00205             CDat            Dat;
00206             vector<size_t>  veciGenes;
00207 
00208             if( !Dat.Open( sArgs.inputs[ i ], !!sArgs.memmap_flag ) ) {
00209                 cerr << "Couldn't open: " << sArgs.inputs[ i ] << endl;
00210                 return 1; }
00211             veciGenes.resize( Dat.GetGenes( ) );
00212             for( j = 0; j < veciGenes.size( ); ++j )
00213                 veciGenes[ j ] = GenesIn.GetGene( Dat.GetGene( j ) );
00214             for( j = 0; j < veciGenes.size( ); ++j ) {
00215                 if( ( iOne = veciGenes[ j ] ) == -1 )
00216                     continue;
00217                 for( k = ( j + 1 ); k < veciGenes.size( ); ++k )
00218                     if( ( ( iTwo = veciGenes[ k ] ) != -1 ) && !CMeta::IsNaN( Dat.Get( j, k ) ) )
00219                         DatCounts.Get( iOne, iTwo )++; } }
00220         for( i = 0; i < DatCounts.GetSize( ); ++i )
00221             for( j = ( i + 1 ); j < DatCounts.GetSize( ); ++j )
00222                 if( DatCounts.Get( i, j ) > (size_t)sArgs.paircount_arg )
00223                     cout << GenesIn.GetGene( i ).GetName( ) << '\t' << GenesIn.GetGene( j ).GetName( ) <<
00224                         '\t' << DatCounts.Get( i, j ) << endl; }
00225     else {
00226         vector<string>  vecstrFiles;
00227         bool            fOpen;
00228 
00229         vecstrFiles.resize( sArgs.inputs_num );
00230         copy( sArgs.inputs, sArgs.inputs + sArgs.inputs_num, vecstrFiles.begin( ) );
00231         fOpen = sArgs.continuous_flag ? DataContinuous.Open( vecstrFiles ) : Data.Open( vecstrFiles );
00232         if( !fOpen ) {
00233             cerr << "Couldn't open inputs" << endl;
00234             return 1; }
00235         pData = sArgs.continuous_flag ? (IDataset*)&DataContinuous : (IDataset*)&Data;
00236         if( GenesIn.GetGenes( ) )
00237             pData->FilterGenes( GenesIn, CDat::EFilterInclude );
00238         if( GenesEx.GetGenes( ) )
00239             pData->FilterGenes( GenesEx, CDat::EFilterExclude ); }
00240 
00241     if( sArgs.mask_arg ) {
00242         CDat        Mask;
00243         vector<int> veciGenes;
00244 
00245         if( !Mask.Open( sArgs.mask_arg ) ) {
00246             cerr << "Couldn't open: " << sArgs.mask_arg << endl;
00247             return 1; }
00248         veciGenes.resize( pData->GetGenes( ) );
00249         for( i = 0; i < pData->GetGenes( ); ++i )
00250             veciGenes[ i ] = (int)Mask.GetGene( pData->GetGene( i ) );
00251         for( i = 0; i < pData->GetGenes( ); ++i ) {
00252             if( ( iOne = veciGenes[ i ] ) == -1 ) {
00253                 for( j = ( i + 1 ); j < pData->GetGenes( ); ++j )
00254                     pData->Remove( i, j );
00255                 continue; }
00256             for( j = ( i + 1 ); j < pData->GetGenes( ); ++j )
00257                 if( ( ( iTwo = veciGenes[ j ] ) == -1 ) ||
00258                     CMeta::IsNaN( d = Mask.Get( iOne, iTwo ) ) || ( d <= 0 ) )
00259                     pData->Remove( i, j ); } }
00260 
00261     if( sArgs.output_arg ) {
00262         ofstream    ofsm;
00263 
00264         ofsm.open( sArgs.output_arg, ios_base::binary );
00265         pData->Save( ofsm, true );
00266         ofsm.close( ); }
00267     else
00268         pData->Save( cout, false );
00269 
00270     return 0; }