Sleipnir
tools/Txt2Bin/Txt2Bin.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
// Format names; main( ) compares c_szBin and c_szDat against the parsed
// from/to command-line arguments to choose a conversion.
static const char c_szTxt[] = "txt";
static const char c_szBin[] = "bin";
static const char c_szDat[] = "dat";
00028 
00029 bool OpenBin( istream&, ostream& );
00030 bool OpenMat( istream&, bool, ostream&, bool );
00031 bool OpenText( istream&, ostream& );
00032 bool OpenDats( const CDataPair&, const vector<string>&, ostream&, const CGenes&,
00033     const CGenes& );
00034 
00035 int main( int iArgs, char** aszArgs ) {
00036     gengetopt_args_info sArgs;
00037     ifstream            ifsm;
00038     vector<string>      vecstrDats;
00039     size_t              i;
00040     ofstream            ofsm;
00041     CGenome             Genome;
00042     CGenes              GenesEx( Genome ), GenesIn( Genome );
00043     CDataPair           Answers;
00044 
00045     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00046         cmdline_parser_print_help( );
00047         return 1; }
00048     CMeta Meta( sArgs.verbosity_arg );
00049 
00050     if( sArgs.genex_arg ) {
00051         ifsm.open( sArgs.genex_arg );
00052         GenesEx.Open( ifsm );
00053         ifsm.close( ); }
00054     if( sArgs.genes_arg ) {
00055         ifsm.clear( );
00056         ifsm.open( sArgs.genes_arg );
00057         GenesIn.Open( ifsm );
00058         ifsm.close( ); }
00059 
00060     ifsm.clear( );
00061     if( sArgs.matrix_flag ) {
00062         bool    fBinIn, fBinOut;
00063 
00064         fBinIn = !strcmp( sArgs.from_arg, c_szBin );
00065         fBinOut = !strcmp( sArgs.to_arg, c_szBin );
00066         if( sArgs.output_arg )
00067             ofsm.open( sArgs.output_arg, fBinOut ? ios_base::binary : ios_base::out );
00068         ifsm.open( sArgs.input_arg, fBinIn ? ios_base::binary : ios_base::in );
00069         if( !OpenMat( ifsm, fBinIn, sArgs.output_arg ? (ostream&)ofsm : cout, sArgs.output_arg && fBinOut ) ) {
00070             cerr << "Couldn't open: " << sArgs.input_arg << endl;
00071             return 1; }
00072         ifsm.close( ); }
00073     else if( !strcmp( sArgs.from_arg, c_szBin ) ) {
00074         if( sArgs.output_arg )
00075             ofsm.open( sArgs.output_arg );
00076         ifsm.open( sArgs.input_arg, ios_base::binary );
00077         if( !OpenBin( ifsm, sArgs.output_arg ? (ostream&)ofsm : cout ) ) {
00078             cerr << "Couldn't open: " << sArgs.input_arg << endl;
00079             return 1; }
00080         ifsm.close( ); }
00081     else if( !strcmp( sArgs.from_arg, c_szDat ) ) {
00082         if( !sArgs.output_arg ) {
00083             cmdline_parser_print_help( );
00084             return 1; }
00085         ofsm.open( sArgs.output_arg, ios_base::binary );
00086         if( !Answers.Open( sArgs.answers_arg, true ) ) {
00087             cerr << "Couldn't open: " << sArgs.answers_arg << endl;
00088             return 1; }
00089         vecstrDats.resize( sArgs.inputs_num );
00090         for( i = 0; i < vecstrDats.size( ); ++i )
00091             vecstrDats[ i ] = sArgs.inputs[ i ];
00092         if( !OpenDats( Answers, vecstrDats, ofsm, GenesIn, GenesEx ) ) {
00093             cerr << "Couldn't open DAT files" << endl;
00094             return 1; } }
00095     else {
00096         if( !sArgs.output_arg ) {
00097             cmdline_parser_print_help( );
00098             return 1; }
00099         if( sArgs.input_arg )
00100             ifsm.open( sArgs.input_arg );
00101         if( !OpenText( sArgs.input_arg ? (istream&)ifsm : cin, ofsm ) ) {
00102             cerr << "Couldn't open: " << ( sArgs.input_arg ? sArgs.input_arg : "stdin" ) <<
00103                 endl;
00104             return 1; }
00105         if( sArgs.input_arg )
00106             ifsm.close( ); }
00107 
00108     if( sArgs.output_arg )
00109         ofsm.close( );
00110     else
00111         cout.flush( );
00112 
00113     return 0; }
00114 
00115 bool OpenDats( const CDataPair& Answers, const vector<string>& vecstrDATs, ostream& ostm,
00116     const CGenes& GenesIn, const CGenes& GenesEx ) {
00117     uint32_t        i, j, k, iPairs;
00118     CBinaryMatrix   Pairs;
00119     vector<size_t>  veciGenes;
00120     float           d;
00121 
00122     Pairs.Initialize( Answers.GetGenes( ) );
00123     for( i = 0; i < Pairs.GetSize( ); ++i )
00124         for( j = ( i + 1 ); j < Pairs.GetSize( ); ++j )
00125             Pairs.Set( i, j, false );
00126 
00127     veciGenes.resize( Answers.GetGenes( ) );
00128     for( i = 0; i < vecstrDATs.size( ); ++i ) {
00129         CDat    Dat;
00130 
00131         if( !Dat.Open( vecstrDATs[ i ].c_str( ) ) ) {
00132             cerr << "Couldn't open: " << vecstrDATs[ i ] << endl;
00133             return false; }
00134         cerr << "OpenDats( ) testing " << vecstrDATs[ i ] << endl;
00135         for( j = 0; j < veciGenes.size( ); ++j )
00136             veciGenes[ j ] = Dat.GetGene( Answers.GetGene( j ) );
00137         for( j = 0; j < Pairs.GetSize( ); ++j )
00138             if( veciGenes[ j ] != -1 )
00139                 for( k = ( j + 1 ); k < Pairs.GetSize( ); ++k )
00140                     if( ( veciGenes[ k ] != -1 ) && !Pairs.Get( j, k ) &&
00141                         !CMeta::IsNaN( Dat.Get( veciGenes[ j ], veciGenes[ k ] ) ) )
00142                         Pairs.Set( j, k, true ); }
00143 
00144     if( GenesEx.GetGenes( ) ) {
00145         for( i = 0; i < veciGenes.size( ); ++i )
00146             veciGenes[ i ] = GenesEx.IsGene( Answers.GetGene( i ) );
00147         for( i = 0; i < Pairs.GetSize( ); ++i ) {
00148             if( veciGenes[ i ] ) {
00149                 for( j = ( i + 1 ); j < Pairs.GetSize( ); ++j )
00150                     Pairs.Set( i, j, false );
00151                 continue; }
00152             for( j = ( i + 1 ); j < Pairs.GetSize( ); ++j )
00153                 if( veciGenes[ j ] )
00154                     Pairs.Set( i, j, false ); } }
00155     if( GenesIn.GetGenes( ) ) {
00156         for( i = 0; i < veciGenes.size( ); ++i )
00157             veciGenes[ i ] = GenesIn.IsGene( Answers.GetGene( i ) );
00158         for( i = 0; i < Pairs.GetSize( ); ++i ) {
00159             if( !veciGenes[ i ] )
00160                 for( j = ( i + 1 ); j < Pairs.GetSize( ); ++j )
00161                     if( !veciGenes[ j ] )
00162                         Pairs.Set( i, j, false ); } }
00163 
00164     cerr << "OpenDats( ) storing answers" << endl;
00165     k = 2 * sizeof(iPairs);
00166     ostm.seekp( k, ios_base::beg );
00167     for( iPairs = i = 0; i < Pairs.GetSize( ); ++i )
00168         for( j = ( i + 1 ); j < Pairs.GetSize( ); ++j )
00169             if( CMeta::IsNaN( d = Answers.Get( i, j ) ) )
00170                 Pairs.Set( i, j, false );
00171             else if( Pairs.Get( i, j ) ) {
00172                 d = d ? 1 : -1.0f;
00173                 ostm.write( (char*)&d, sizeof(d) );
00174                 ostm.seekp( (ostream::off_type)( vecstrDATs.size( ) * sizeof(float) ),
00175                     ios_base::cur );
00176                 ostm.write( (char*)&k, sizeof(k) );
00177                 ostm.write( (char*)&i, sizeof(i) );
00178                 ostm.write( (char*)&j, sizeof(j) );
00179                 iPairs++; }
00180 
00181     ostm.seekp( 0, ios_base::beg );
00182     i = (uint32_t)vecstrDATs.size( );
00183     ostm.write( (char*)&i, sizeof(i) );
00184     ostm.write( (char*)&iPairs, sizeof(iPairs) );
00185     for( i = 0; i < vecstrDATs.size( ); ++i ) {
00186         CDat    Dat;
00187 
00188         if( !Dat.Open( vecstrDATs[ i ].c_str( ) ) ) {
00189             cerr << "Couldn't open: " << vecstrDATs[ i ] << endl;
00190             return false; }
00191         cerr << "OpenDats( ) storing " << vecstrDATs[ i ] << endl;
00192         for( j = 0; j < veciGenes.size( ); ++j )
00193             veciGenes[ j ] = Dat.GetGene( Answers.GetGene( j ) );
00194         ostm.seekp( sizeof(float) + ( 2 * sizeof(iPairs) ) + ( i * sizeof(float) ),
00195             ios_base::beg );
00196         for( j = 0; j < Pairs.GetSize( ); ++j )
00197             for( k = ( j + 1 ); k < Pairs.GetSize( ); ++k )
00198                 if( Pairs.Get( j, k ) ) {
00199                     d = ( ( veciGenes[ j ] == -1 ) || ( veciGenes[ k ] == -1 ) ||
00200                         CMeta::IsNaN( d = Dat.Get( veciGenes[ j ], veciGenes[ k ] ) ) ) ?
00201                         0 : ( 2 * d ) - 1;
00202                     ostm.write( (char*)&d, sizeof(d) );
00203                     ostm.seekp( (ostream::off_type)( ( 3 * sizeof(iPairs) ) +
00204                         ( vecstrDATs.size( ) * sizeof(float) ) ), ios_base::cur ); } }
00205     ostm.seekp( 0, ios_base::end );
00206     i = (uint32_t)Answers.GetGenes( );
00207     ostm.write( (char*)&i, sizeof(i) );
00208     for( j = i = 0; i < Answers.GetGenes( ); ++i ) {
00209         const string&   strGene = Answers.GetGene( i );
00210 
00211         ostm.write( strGene.c_str( ), (streamsize)strGene.length( ) );
00212         ostm.write( (char*)&j, 1 ); }
00213 
00214     return true; }
00215 
/*!
 * \brief
 * Placeholder for text input conversion; performs no I/O.
 *
 * \param istm
 * Input stream (unused).
 *
 * \param ostm
 * Output stream (unused).
 *
 * \returns
 * Always false.
 */
bool OpenText( std::istream& istm, std::ostream& ostm ) {
    // Neither stream is touched.
    (void)istm;
    (void)ostm;

    return false; }
00219 
/*!
 * \brief
 * Dumps a binary example file as text, one record per line.
 *
 * The input is read as: two uint32_t counts (values per record, number of
 * records); the records; then a uint32_t gene count followed by NUL-terminated
 * gene names. Each record holds one leading float, the per-record values, and a
 * uint32_t length. A length of 2 * sizeof(uint32_t) is followed by two uint32_t
 * gene indices; any other non-zero length is followed by that many comment bytes.
 * The gene table is located by assuming every record ends in a 12-byte trailer.
 *
 * Each output line is the leading float, then tab-separated "index:value"
 * entries numbered from 1, then an optional " # " suffix holding either the two
 * gene names (tab separated) or the comment text.
 *
 * \param istm
 * Seekable binary input stream.
 *
 * \param ostm
 * Stream receiving the text output.
 *
 * \returns
 * True on success; false if the input is truncated, a gene index is out of
 * range, or a comment does not fit the internal buffer. Lines already written
 * before a failure are not rolled back.
 */
bool OpenBin( std::istream& istm, std::ostream& ostm ) {
    static const size_t         c_iSize = 512;
    char                        sz[ c_iSize ];
    uint32_t                    i, j, k, iLength, iWords, iDocs, iGenes;
    std::vector<float>          vecdRow;
    std::vector<std::string>    vecstrGenes;
    std::string                 strGene;

    if( !istm.read( reinterpret_cast<char*>( &iWords ), sizeof(iWords) ) ||
        !istm.read( reinterpret_cast<char*>( &iDocs ), sizeof(iDocs) ) )
        return false;

    // Skip all records to reach the gene table stored after them.
    {
        const size_t    iRecord = ( ( (size_t)iWords + 1 ) * sizeof(float) ) +
            ( 3 * sizeof(iWords) );

        istm.seekg( (std::istream::off_type)( iDocs * iRecord ), std::ios_base::cur ); }
    if( !istm.read( reinterpret_cast<char*>( &iGenes ), sizeof(iGenes) ) )
        return false;
    // Names are appended one at a time so a corrupt count cannot force a huge
    // allocation before the first failed read is noticed.
    for( i = 0; i < iGenes; ++i ) {
        // Hitting end-of-file before the terminating NUL means a truncated table.
        if( !std::getline( istm, strGene, '\0' ) || istm.eof( ) )
            return false;
        vecstrGenes.push_back( strGene ); }

    // Rewind to the first record, just past the two header counts.
    istm.seekg( (std::istream::off_type)( 2 * sizeof(iWords) ), std::ios_base::beg );
    if( !istm )
        return false;

    vecdRow.resize( (size_t)iWords + 1 );
    for( i = 0; i < iDocs; ++i ) {
        if( !istm.read( reinterpret_cast<char*>( &vecdRow[ 0 ] ),
            (std::streamsize)( vecdRow.size( ) * sizeof(float) ) ) )
            return false;
        ostm << vecdRow[ 0 ];
        for( j = 1; j <= iWords; ++j )
            ostm << '\t' << (unsigned int)j << ':' << vecdRow[ j ];

        if( !istm.read( reinterpret_cast<char*>( &iLength ), sizeof(iLength) ) )
            return false;
        if( iLength == ( 2 * sizeof(iWords) ) ) {
            // Trailer holds a pair of indices into the gene table.
            if( !istm.read( reinterpret_cast<char*>( &j ), sizeof(j) ) ||
                !istm.read( reinterpret_cast<char*>( &k ), sizeof(k) ) )
                return false;
            if( ( j >= vecstrGenes.size( ) ) || ( k >= vecstrGenes.size( ) ) )
                return false;
            ostm << " # " << vecstrGenes[ j ] << '\t' << vecstrGenes[ k ]; }
        else if( iLength ) {
            // Free-form comment; must leave room for the terminating NUL.
            if( ( iLength >= c_iSize ) || !istm.read( sz, (std::streamsize)iLength ) )
                return false;
            sz[ iLength ] = 0;
            ostm << " # " << sz; }
        ostm << '\n'; }

    return true; }
00261 
00262 bool OpenMat( istream& istm, bool fBinIn, ostream& ostm, bool fBinOut ) {
00263     CDataMatrix Mat;
00264 
00265     if( !Mat.Open( istm, fBinIn ) )
00266         return false;
00267     Mat.Save( ostm, fBinOut );
00268 
00269     return true; }