Sleipnir
tools/DataDumper/DataDumper.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
// Literal ".dab" file-name suffix.
// NOTE(review): main() in this file never references this constant; it looks
// like a leftover -- confirm nothing else in the translation unit needs it.
00025 static const char   c_acDab[]   = ".dab";
00026 
00027 int main( int iArgs, char** aszArgs ) {
00028     gengetopt_args_info sArgs;
00029     size_t                      i, j, k, iPairs, iPair, iArg;
00030     map<string,size_t>          mapZeros;
00031     vector<string>              vecstrNames;
00032     CDataPair                   Answers;
00033     CFullMatrix<unsigned char>  MatData;
00034     vector<size_t>              veciGenes;
00035     float                       d;
00036 
00037     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00038         cmdline_parser_print_help( );
00039         return 1; }
00040     CMeta Meta( sArgs.verbosity_arg );
00041 
00042     if( sArgs.zeros_arg ) {
00043         ifstream        ifsm;
00044         vector<string>  vecstrZeros;
00045         char            acLine[ 1024 ];
00046 
00047         ifsm.open( sArgs.zeros_arg );
00048         if( !ifsm.is_open( ) ) {
00049             cerr << "Couldn't open: " << sArgs.zeros_arg << endl;
00050             return 1; }
00051         while( !ifsm.eof( ) ) {
00052             ifsm.getline( acLine, ARRAYSIZE(acLine) - 1 );
00053             acLine[ ARRAYSIZE(acLine) - 1 ] = 0;
00054             vecstrZeros.clear( );
00055             CMeta::Tokenize( acLine, vecstrZeros );
00056             if( vecstrZeros.empty( ) )
00057                 continue;
00058             mapZeros[ vecstrZeros[ 0 ] ] = atoi( vecstrZeros[ 1 ].c_str( ) ); } }
00059 
00060     if( !Answers.Open( sArgs.answers_arg, false ) ) {
00061         cerr << "Couldn't open: " << sArgs.answers_arg << endl;
00062         return 1; }
00063     if( sArgs.genes_arg && !Answers.FilterGenes( sArgs.genes_arg, CDat::EFilterInclude ) ) {
00064         cerr << "Couldn't open: " << sArgs.genes_arg << endl;
00065         return 1; }
00066     if( sArgs.genet_arg && !Answers.FilterGenes( sArgs.genet_arg, CDat::EFilterTerm ) ) {
00067         cerr << "Couldn't open: " << sArgs.genet_arg << endl;
00068         return 1; }
00069     if( sArgs.genex_arg && !Answers.FilterGenes( sArgs.genex_arg, CDat::EFilterExclude ) ) {
00070         cerr << "Couldn't open: " << sArgs.genex_arg << endl;
00071         return 1; }
00072 
00073     for( iPairs = i = 0; i < Answers.GetGenes( ); ++i )
00074         for( j = ( i + 1 ); j < Answers.GetGenes( ); ++j )
00075             if( !CMeta::IsNaN( Answers.Get( i, j ) ) )
00076                 iPairs++;
00077     MatData.Initialize( iPairs, sArgs.inputs_num );
00078     MatData.Clear( );
00079 
00080     veciGenes.resize( Answers.GetGenes( ) );
00081     for( iArg = 0; iArg < sArgs.inputs_num; ++iArg ) {
00082         CDatasetCompact                     Data;
00083         size_t                              iOne, iTwo, iZero, iVal;
00084         map<string,size_t>::const_iterator  iterZero;
00085 
00086         vecstrNames.clear( );
00087         vecstrNames.push_back( sArgs.inputs[ iArg ] );
00088         if( !Data.Open( Answers, vecstrNames, true ) ) {
00089             cerr << "Couldn't open: " << sArgs.inputs[ iArg ] << endl;
00090             return 1; }
00091         vecstrNames[ 0 ] = CMeta::Filename( CMeta::Deextension( CMeta::Basename( vecstrNames[ 0 ].c_str( ) ) ) );
00092         iZero = ( ( iterZero = mapZeros.find( vecstrNames[ 0 ] ) ) == mapZeros.end( ) ) ? -1 :
00093             iterZero->second;
00094         for( i = 0; i < veciGenes.size( ); ++i )
00095             veciGenes[ i ] = Data.GetGene( Answers.GetGene( i ) );
00096         for( iPair = i = 0; i < veciGenes.size( ); ++i ) {
00097             iOne = veciGenes[ i ];
00098             for( j = ( i + 1 ); j < veciGenes.size( ); ++j ) {
00099                 if( CMeta::IsNaN( Answers.Get( i, j ) ) )
00100                     continue;
00101                 if( ( iOne != -1 ) && ( ( iTwo = veciGenes[ j ] ) != -1 ) ) {
00102                     iVal = Data.GetDiscrete( iOne, iTwo, 1 );
00103                     if( ( iVal != -1 ) || ( ( iVal = iZero ) != -1 ) || ( sArgs.zero_flag && !( iVal = 0 ) ) )
00104                         MatData.Set( iPair, iArg, (unsigned char)( iVal + 1 ) ); }
00105                 iPair++; } } }
00106 
00107     cout << "Gene 1 Gene 2  Answer";
00108     for( i = 0; i < sArgs.inputs_num; ++i )
00109         cout << '\t' << sArgs.inputs[ i ];
00110     cout << endl;
00111     for( iPair = i = 0; i < Answers.GetGenes( ); ++i )
00112         for( j = ( i + 1 ); j < Answers.GetGenes( ); ++j ) {
00113             if( CMeta::IsNaN( d = Answers.Get( i, j ) ) )
00114                 continue;
00115             cout << Answers.GetGene( i ) << '\t' << Answers.GetGene( j ) << '\t' << d;
00116             for( k = 0; k < MatData.GetColumns( ); ++k )
00117                 cout << '\t' << ( (int)MatData.Get( iPair, k ) - 1 );
00118             cout << endl;
00119             iPair++; }
00120 
00121     return 0; }