Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 static const char c_szERROR[] = "ERROR"; 00026 static const char c_szAnnotated[] = "annotated"; 00027 static const char c_szValue[] = "value"; 00028 static const char c_cComment = '#'; 00029 static const char c_cDot = '.'; 00030 static const size_t c_iBuf = 1024; 00031 00032 struct SFeature { 00033 string m_strName; 00034 vector<string> m_vecstrValues; 00035 size_t m_iDefault; 00036 00037 SFeature( const string& strName ) : m_strName(strName), m_iDefault(-1) { } 00038 00039 size_t quantize( const string& strValue ) const { 00040 size_t i; 00041 00042 for( i = 0; i < m_vecstrValues.size( ); ++i ) 00043 if( strValue == m_vecstrValues[ i ] ) 00044 return i; 00045 00046 return -1; } 00047 }; 00048 00049 struct SDatum { 00050 string m_strName; 00051 map<size_t,size_t> m_mapiiFeatures; 00052 }; 00053 00054 int main( int iArgs, char** aszArgs ) { 00055 gengetopt_args_info sArgs; 00056 size_t i, j, k; 00057 vector<SFeature> vecsFeatures; 00058 vector<string> vecstrLine, vecstrToken; 00059 ifstream ifsm; 00060 char szBuf[ c_iBuf ]; 00061 map<string,SDatum> mapValues; 00062 CGenome Genome; 00063 CGenes GenesPos( Genome ); 00064 // vector<float> vecdQuants; 00065 00066 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00067 cmdline_parser_print_help( ); 00068 return 1; } 00069 CMeta Meta( sArgs.verbosity_arg ); 00070 00071 ifsm.open( sArgs.environment_arg ); 00072 if( !ifsm.is_open( ) ) { 00073 cerr << "Could not open: " << sArgs.environment_arg << endl; 00074 return 1; } 00075 while( ifsm.peek( ) != EOF ) { 00076 ifsm.getline( szBuf, c_iBuf - 1 ); 00077 vecstrLine.clear( ); 00078 CMeta::Tokenize( szBuf, vecstrLine ); 00079 if( ( vecstrLine.size( ) != 0 ) && ( vecstrLine[ 0 ][ 0 ] != c_cComment ) ) { 00080 vecsFeatures.push_back( SFeature( vecstrLine[ 0 ] ) ); 00081 { 00082 SFeature& sFeature = vecsFeatures[ vecsFeatures.size( ) - 1 ]; 00083 00084 vecstrToken.clear( ); 00085 CMeta::Tokenize( vecstrLine[ 1 ].c_str( ), vecstrToken, "|" ); 00086 sFeature.m_vecstrValues.resize( vecstrToken.size( ) ); 00087 copy( vecstrToken.begin( ), vecstrToken.end( ), sFeature.m_vecstrValues.begin( ) ); 00088 00089 if( vecstrLine.size( ) > 2 ) 00090 sFeature.m_iDefault = sFeature.quantize( vecstrLine[ 2 ] ); 00091 } } } 00092 ifsm.close( ); 00093 00094 ifsm.clear( ); 00095 ifsm.open( sArgs.data_arg ); 00096 if( !ifsm.is_open( ) ) { 00097 cerr << "Could not open: " << sArgs.data_arg << endl; 00098 return 1; } 00099 while( ifsm.peek( ) != EOF ) { 00100 ifsm.getline( szBuf, c_iBuf - 1 ); 00101 vecstrLine.clear( ); 00102 CMeta::Tokenize( szBuf, vecstrLine ); 00103 if( vecstrLine.size( ) == 0 ) 00104 continue; 00105 { 00106 SDatum& sCur = mapValues[ vecstrLine[ 0 ] ]; 00107 char* pc; 00108 00109 sCur.m_strName = vecstrLine[ 0 ]; 00110 for( i = 1; i < vecstrLine.size( ); ++i ) { 00111 vecstrToken.clear( ); 00112 CMeta::Tokenize( vecstrLine[ i ].c_str( ), vecstrToken, "|" ); 00113 if( vecstrToken.size( ) != 2 ) { 00114 cerr << "Illegal token in " << sArgs.data_arg << ": " << szBuf << endl; 00115 return 1; } 00116 for( j = 0; j < vecsFeatures.size( ); ++j ) 00117 if( vecstrToken[ 0 ] == vecsFeatures[ j ].m_strName ) 00118 break; 00119 if( j >= vecsFeatures.size( ) ) { 00120 cerr << "Unknown feature: " << vecstrLine[ i ] << endl; 00121 return 1; } 00122 k = strtol( vecstrToken[ 1 ].c_str( ), &pc, 10 ); 00123 sCur.m_mapiiFeatures[ j ] = ( pc != vecstrToken[ 1 ].c_str( )) ? k : 00124 vecsFeatures[ j ].quantize( vecstrToken[ 1 ] ); } 00125 } } 00126 ifsm.close( ); 00127 00128 /* 00129 ifsm.clear( ); 00130 ifsm.open( sArgs.quants_arg ); 00131 if( !ifsm.is_open( ) ) { 00132 cerr << "Could not open: " << sArgs.quants_arg << endl; 00133 return 1; } 00134 ifsm.getline( szBuf, c_iBuf - 1 ); 00135 vecstrLine.clear( ); 00136 CMeta::Tokenize( szBuf, vecstrLine ); 00137 vecdQuants.resize( vecstrLine.size( ) ); 00138 for( i = 0; i < vecdQuants.size( ); ++i ) 00139 vecdQuants[ i ] = (float)atof( vecstrLine[ i ].c_str( ) ); 00140 ifsm.close( ); 00141 */ 00142 00143 if( sArgs.genome_arg ) { 00144 ifsm.clear( ); 00145 ifsm.open( sArgs.genome_arg ); 00146 if( !Genome.Open( ifsm ) ) { 00147 cerr << "Could not open: " << sArgs.genome_arg << endl; 00148 return 1; } 00149 ifsm.close( ); } 00150 00151 if( sArgs.positives_arg ) { 00152 ifsm.clear( ); 00153 ifsm.open( sArgs.positives_arg ); } 00154 if( !GenesPos.Open( sArgs.positives_arg ? ifsm : cin ) ) { 00155 cerr << "Could not open: " << ( sArgs.positives_arg ? sArgs.positives_arg : "input genes" ) << endl; 00156 return 1; } 00157 if( sArgs.positives_arg ) 00158 ifsm.close( ); 00159 ifsm.clear( ); 00160 00161 cout << "<?xml version='1.0' encoding='utf-8'?>" << endl; 00162 cout << "<dataset name='datasets'>" << endl; 00163 cout << " <header>" << endl; 00164 cout << " <attributes>" << endl; 00165 /* 00166 cout << " <attribute name='annotated' type='nominal'>" << endl; 00167 cout << " <labels>" << endl; 00168 cout << " <label>1</label>" << endl; 00169 cout << " <label>2</label>" << endl; 00170 cout << " </labels>" << endl; 00171 cout << " </attribute>" << endl; 00172 */ 00173 cout << " <attribute name='value' type='numeric'/>" << endl; 00174 for( i = 0; i < vecsFeatures.size( ); ++i ) { 00175 cout << " <attribute name='" << vecsFeatures[ i ].m_strName << "' type='nominal'>" << endl; 00176 cout << " <labels>" << endl; 00177 for( j = 0; j < vecsFeatures[ i ].m_vecstrValues.size( ); ++j ) 00178 cout << " <label>" << ( j + 1 ) << "</label>" << endl; 00179 cout << " </labels>" << endl; 00180 cout << " </attribute>" << endl; } 00181 /* 00182 for( i = 0; i < Genome.GetGenes( ); ++i ) { 00183 cout << " <attribute name='" << Genome.GetGene( i ).GetName( ) << "' type='nominal'>" << endl; 00184 cout << " <labels>" << endl; 00185 cout << " <label>1</label>" << endl; 00186 cout << " <label>2</label>" << endl; 00187 cout << " </labels>" << endl; 00188 cout << " </attribute>" << endl; } 00189 */ 00190 cout << " </attributes>" << endl; 00191 cout << " </header>" << endl; 00192 00193 cout << " <body>" << endl; 00194 cout << " <instances>" << endl; 00195 for( i = 0; i < sArgs.inputs_num; ++i ) { 00196 CPCL PCL; 00197 CDat Dat; 00198 size_t iCount; 00199 vector<size_t> veciGenes; 00200 float d, dAverage; 00201 map<size_t,size_t> mapiiCur; 00202 int iRet; 00203 string strName; 00204 00205 cerr << "Processing: " << sArgs.inputs[ i ] << endl; 00206 00207 strName = CMeta::Basename( sArgs.inputs[ i ] ); 00208 while( ( j = strName.rfind( c_cDot ) ) != string::npos ) 00209 strName = strName.substr( 0, j ); 00210 const SDatum& sDatum = mapValues[ strName ]; 00211 00212 if( !Dat.Open( sArgs.inputs[ i ], !!sArgs.memmap_flag ) && ( iRet = CPCL::Distance( sArgs.inputs[ i ], 00213 sArgs.skip_arg, sArgs.distance_arg, !!sArgs.normalize_flag, !!sArgs.zscore_flag, false, 00214 sArgs.genome_arg, CMeta::GetNaN( ), -1, PCL, Dat ) ) ) { 00215 cerr << "Could not open: " << sArgs.inputs[ i ] << endl; 00216 cmdline_parser_print_help( ); 00217 return iRet; } 00218 00219 veciGenes.resize( GenesPos.GetGenes( ) ); 00220 for( j = 0; j < veciGenes.size( ); ++j ) 00221 veciGenes[ j ] = Dat.GetGene( GenesPos.GetGene( j ).GetName( ) ); 00222 dAverage = 0; 00223 for( iCount = j = 0; j < veciGenes.size( ); ++j ) { 00224 if( veciGenes[ j ] == -1 ) 00225 continue; 00226 for( k = ( j + 1 ); k < veciGenes.size( ); ++k ) 00227 if( ( veciGenes[ k ] != -1 ) && 00228 !CMeta::IsNaN( d = Dat.Get( veciGenes[ j ], veciGenes[ k ] ) ) ) { 00229 iCount++; 00230 dAverage += d; } } 00231 if( iCount ) 00232 dAverage /= iCount; 00233 /* 00234 adCentroid = new float[ PCL.GetExperiments( ) ]; 00235 for( iCount = i = 0; i < GenesPos.GetGenes( ); ++i ) { 00236 if( ( j = PCL.GetGene( GenesPos.GetGene( i ).GetName( ) ) ) == -1 ) 00237 continue; 00238 iCount++; 00239 for( k = 0; k < PCL.GetExperiments( ); ++k ) 00240 adCentroid[ k ] += PCL.Get( j, k ); } 00241 for( i = 0; i < PCL.GetExperiments( ); ++i ) 00242 adCentroid[ i ] /= iCount; 00243 */ 00244 00245 cout << " <instance>" << endl; 00246 cout << " <value>" << dAverage << "</value>" << endl; 00247 for( j = 0; j < vecsFeatures.size( ); ++j ) { 00248 const SFeature& sFeature = vecsFeatures[ j ]; 00249 map<size_t,size_t>::const_iterator iterCur; 00250 size_t iCur; 00251 00252 if( ( iterCur = sDatum.m_mapiiFeatures.find( j ) ) != sDatum.m_mapiiFeatures.end( ) ) 00253 iCur = iterCur->second; 00254 else 00255 iCur = sFeature.m_iDefault; 00256 cout << " <value>" << ( iCur + 1 ) << "</value>" << endl; } 00257 cout << " </instance>" << endl; } 00258 cout << " </instances>" << endl; 00259 cout << " </body>" << endl; 00260 cout << "</dataset>" << endl; 00261 00262 return 0; }