// Sleipnir
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 const char c_szTxt[] = "txt"; 00026 const char c_szBin[] = "bin"; 00027 const char c_szDat[] = "dat"; 00028 00029 bool OpenBin( istream&, ostream& ); 00030 bool OpenMat( istream&, bool, ostream&, bool ); 00031 bool OpenText( istream&, ostream& ); 00032 bool OpenDats( const CDataPair&, const vector<string>&, ostream&, const CGenes&, 00033 const CGenes& ); 00034 00035 int main( int iArgs, char** aszArgs ) { 00036 gengetopt_args_info sArgs; 00037 ifstream ifsm; 00038 vector<string> vecstrDats; 00039 size_t i; 00040 ofstream ofsm; 00041 CGenome Genome; 00042 CGenes GenesEx( Genome ), GenesIn( Genome ); 00043 CDataPair Answers; 00044 00045 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00046 cmdline_parser_print_help( ); 00047 return 1; } 00048 CMeta Meta( sArgs.verbosity_arg ); 00049 00050 if( 
sArgs.genex_arg ) { 00051 ifsm.open( sArgs.genex_arg ); 00052 GenesEx.Open( ifsm ); 00053 ifsm.close( ); } 00054 if( sArgs.genes_arg ) { 00055 ifsm.clear( ); 00056 ifsm.open( sArgs.genes_arg ); 00057 GenesIn.Open( ifsm ); 00058 ifsm.close( ); } 00059 00060 ifsm.clear( ); 00061 if( sArgs.matrix_flag ) { 00062 bool fBinIn, fBinOut; 00063 00064 fBinIn = !strcmp( sArgs.from_arg, c_szBin ); 00065 fBinOut = !strcmp( sArgs.to_arg, c_szBin ); 00066 if( sArgs.output_arg ) 00067 ofsm.open( sArgs.output_arg, fBinOut ? ios_base::binary : ios_base::out ); 00068 ifsm.open( sArgs.input_arg, fBinIn ? ios_base::binary : ios_base::in ); 00069 if( !OpenMat( ifsm, fBinIn, sArgs.output_arg ? (ostream&)ofsm : cout, sArgs.output_arg && fBinOut ) ) { 00070 cerr << "Couldn't open: " << sArgs.input_arg << endl; 00071 return 1; } 00072 ifsm.close( ); } 00073 else if( !strcmp( sArgs.from_arg, c_szBin ) ) { 00074 if( sArgs.output_arg ) 00075 ofsm.open( sArgs.output_arg ); 00076 ifsm.open( sArgs.input_arg, ios_base::binary ); 00077 if( !OpenBin( ifsm, sArgs.output_arg ? (ostream&)ofsm : cout ) ) { 00078 cerr << "Couldn't open: " << sArgs.input_arg << endl; 00079 return 1; } 00080 ifsm.close( ); } 00081 else if( !strcmp( sArgs.from_arg, c_szDat ) ) { 00082 if( !sArgs.output_arg ) { 00083 cmdline_parser_print_help( ); 00084 return 1; } 00085 ofsm.open( sArgs.output_arg, ios_base::binary ); 00086 if( !Answers.Open( sArgs.answers_arg, true ) ) { 00087 cerr << "Couldn't open: " << sArgs.answers_arg << endl; 00088 return 1; } 00089 vecstrDats.resize( sArgs.inputs_num ); 00090 for( i = 0; i < vecstrDats.size( ); ++i ) 00091 vecstrDats[ i ] = sArgs.inputs[ i ]; 00092 if( !OpenDats( Answers, vecstrDats, ofsm, GenesIn, GenesEx ) ) { 00093 cerr << "Couldn't open DAT files" << endl; 00094 return 1; } } 00095 else { 00096 if( !sArgs.output_arg ) { 00097 cmdline_parser_print_help( ); 00098 return 1; } 00099 if( sArgs.input_arg ) 00100 ifsm.open( sArgs.input_arg ); 00101 if( !OpenText( sArgs.input_arg ? 
(istream&)ifsm : cin, ofsm ) ) { 00102 cerr << "Couldn't open: " << ( sArgs.input_arg ? sArgs.input_arg : "stdin" ) << 00103 endl; 00104 return 1; } 00105 if( sArgs.input_arg ) 00106 ifsm.close( ); } 00107 00108 if( sArgs.output_arg ) 00109 ofsm.close( ); 00110 else 00111 cout.flush( ); 00112 00113 return 0; } 00114 00115 bool OpenDats( const CDataPair& Answers, const vector<string>& vecstrDATs, ostream& ostm, 00116 const CGenes& GenesIn, const CGenes& GenesEx ) { 00117 uint32_t i, j, k, iPairs; 00118 CBinaryMatrix Pairs; 00119 vector<size_t> veciGenes; 00120 float d; 00121 00122 Pairs.Initialize( Answers.GetGenes( ) ); 00123 for( i = 0; i < Pairs.GetSize( ); ++i ) 00124 for( j = ( i + 1 ); j < Pairs.GetSize( ); ++j ) 00125 Pairs.Set( i, j, false ); 00126 00127 veciGenes.resize( Answers.GetGenes( ) ); 00128 for( i = 0; i < vecstrDATs.size( ); ++i ) { 00129 CDat Dat; 00130 00131 if( !Dat.Open( vecstrDATs[ i ].c_str( ) ) ) { 00132 cerr << "Couldn't open: " << vecstrDATs[ i ] << endl; 00133 return false; } 00134 cerr << "OpenDats( ) testing " << vecstrDATs[ i ] << endl; 00135 for( j = 0; j < veciGenes.size( ); ++j ) 00136 veciGenes[ j ] = Dat.GetGene( Answers.GetGene( j ) ); 00137 for( j = 0; j < Pairs.GetSize( ); ++j ) 00138 if( veciGenes[ j ] != -1 ) 00139 for( k = ( j + 1 ); k < Pairs.GetSize( ); ++k ) 00140 if( ( veciGenes[ k ] != -1 ) && !Pairs.Get( j, k ) && 00141 !CMeta::IsNaN( Dat.Get( veciGenes[ j ], veciGenes[ k ] ) ) ) 00142 Pairs.Set( j, k, true ); } 00143 00144 if( GenesEx.GetGenes( ) ) { 00145 for( i = 0; i < veciGenes.size( ); ++i ) 00146 veciGenes[ i ] = GenesEx.IsGene( Answers.GetGene( i ) ); 00147 for( i = 0; i < Pairs.GetSize( ); ++i ) { 00148 if( veciGenes[ i ] ) { 00149 for( j = ( i + 1 ); j < Pairs.GetSize( ); ++j ) 00150 Pairs.Set( i, j, false ); 00151 continue; } 00152 for( j = ( i + 1 ); j < Pairs.GetSize( ); ++j ) 00153 if( veciGenes[ j ] ) 00154 Pairs.Set( i, j, false ); } } 00155 if( GenesIn.GetGenes( ) ) { 00156 for( i = 0; i < 
veciGenes.size( ); ++i ) 00157 veciGenes[ i ] = GenesIn.IsGene( Answers.GetGene( i ) ); 00158 for( i = 0; i < Pairs.GetSize( ); ++i ) { 00159 if( !veciGenes[ i ] ) 00160 for( j = ( i + 1 ); j < Pairs.GetSize( ); ++j ) 00161 if( !veciGenes[ j ] ) 00162 Pairs.Set( i, j, false ); } } 00163 00164 cerr << "OpenDats( ) storing answers" << endl; 00165 k = 2 * sizeof(iPairs); 00166 ostm.seekp( k, ios_base::beg ); 00167 for( iPairs = i = 0; i < Pairs.GetSize( ); ++i ) 00168 for( j = ( i + 1 ); j < Pairs.GetSize( ); ++j ) 00169 if( CMeta::IsNaN( d = Answers.Get( i, j ) ) ) 00170 Pairs.Set( i, j, false ); 00171 else if( Pairs.Get( i, j ) ) { 00172 d = d ? 1 : -1.0f; 00173 ostm.write( (char*)&d, sizeof(d) ); 00174 ostm.seekp( (ostream::off_type)( vecstrDATs.size( ) * sizeof(float) ), 00175 ios_base::cur ); 00176 ostm.write( (char*)&k, sizeof(k) ); 00177 ostm.write( (char*)&i, sizeof(i) ); 00178 ostm.write( (char*)&j, sizeof(j) ); 00179 iPairs++; } 00180 00181 ostm.seekp( 0, ios_base::beg ); 00182 i = (uint32_t)vecstrDATs.size( ); 00183 ostm.write( (char*)&i, sizeof(i) ); 00184 ostm.write( (char*)&iPairs, sizeof(iPairs) ); 00185 for( i = 0; i < vecstrDATs.size( ); ++i ) { 00186 CDat Dat; 00187 00188 if( !Dat.Open( vecstrDATs[ i ].c_str( ) ) ) { 00189 cerr << "Couldn't open: " << vecstrDATs[ i ] << endl; 00190 return false; } 00191 cerr << "OpenDats( ) storing " << vecstrDATs[ i ] << endl; 00192 for( j = 0; j < veciGenes.size( ); ++j ) 00193 veciGenes[ j ] = Dat.GetGene( Answers.GetGene( j ) ); 00194 ostm.seekp( sizeof(float) + ( 2 * sizeof(iPairs) ) + ( i * sizeof(float) ), 00195 ios_base::beg ); 00196 for( j = 0; j < Pairs.GetSize( ); ++j ) 00197 for( k = ( j + 1 ); k < Pairs.GetSize( ); ++k ) 00198 if( Pairs.Get( j, k ) ) { 00199 d = ( ( veciGenes[ j ] == -1 ) || ( veciGenes[ k ] == -1 ) || 00200 CMeta::IsNaN( d = Dat.Get( veciGenes[ j ], veciGenes[ k ] ) ) ) ? 
00201 0 : ( 2 * d ) - 1; 00202 ostm.write( (char*)&d, sizeof(d) ); 00203 ostm.seekp( (ostream::off_type)( ( 3 * sizeof(iPairs) ) + 00204 ( vecstrDATs.size( ) * sizeof(float) ) ), ios_base::cur ); } } 00205 ostm.seekp( 0, ios_base::end ); 00206 i = (uint32_t)Answers.GetGenes( ); 00207 ostm.write( (char*)&i, sizeof(i) ); 00208 for( j = i = 0; i < Answers.GetGenes( ); ++i ) { 00209 const string& strGene = Answers.GetGene( i ); 00210 00211 ostm.write( strGene.c_str( ), (streamsize)strGene.length( ) ); 00212 ostm.write( (char*)&j, 1 ); } 00213 00214 return true; } 00215 00216 bool OpenText( istream& istm, ostream& ostm ) { 00217 00218 return false; } 00219 00220 bool OpenBin( istream& istm, ostream& ostm ) { 00221 static const size_t c_iSize = 512; 00222 char sz[ c_iSize ]; 00223 char* pc; 00224 uint32_t i, j, k, iWords, iDocs, iGenes; 00225 float* ad; 00226 vector<string> vecstrGenes; 00227 00228 istm.read( (char*)&iWords, sizeof(iWords) ); 00229 istm.read( (char*)&iDocs, sizeof(iDocs) ); 00230 00231 istm.seekg( (istream::off_type)( iDocs * ( ( ( iWords + 1 ) * sizeof(float) ) + 00232 ( 3 * sizeof(iWords) ) ) ), ios_base::cur ); 00233 istm.read( (char*)&iGenes, sizeof(iGenes) ); 00234 vecstrGenes.resize( iGenes ); 00235 for( i = 0; i < vecstrGenes.size( ); ++i ) { 00236 for( pc = sz; ; ++pc ) { 00237 istm.read( pc, 1 ); 00238 if( !*pc ) 00239 break; } 00240 vecstrGenes[ i ] = sz; } 00241 istm.seekg( 2 * sizeof(iWords), ios_base::beg ); 00242 00243 ad = new float[ iWords + 1 ]; 00244 for( i = 0; i < iDocs; ++i ) { 00245 istm.read( (char*)ad, (streamsize)( iWords + 1 ) * sizeof(*ad) ); 00246 cout << ad[ 0 ]; 00247 for( j = 1; j <= iWords; ++j ) 00248 cout << '\t' << (unsigned int)j << ':' << ad[ j ]; 00249 istm.read( (char*)&j, sizeof(j) ); 00250 if( j == ( 2 * sizeof(iWords) ) ) { 00251 istm.read( (char*)&j, sizeof(j) ); 00252 istm.read( (char*)&k, sizeof(k) ); 00253 cout << " # " << vecstrGenes[ j ] << '\t' << vecstrGenes[ k ]; } 00254 else if( j ) { 00255 istm.read( 
sz, (streamsize)j ); 00256 sz[ j ] = 0; 00257 cout << " # " << sz; } 00258 cout << endl; } 00259 00260 return true; } 00261 00262 bool OpenMat( istream& istm, bool fBinIn, ostream& ostm, bool fBinOut ) { 00263 CDataMatrix Mat; 00264 00265 if( !Mat.Open( istm, fBinIn ) ) 00266 return false; 00267 Mat.Save( ostm, fBinOut ); 00268 00269 return true; }