Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 00024 static const char c_acDab[] = ".dab"; 00025 static const char c_acQDab[] = ".qdab"; 00026 static const char c_acDat[] = ".dat"; 00027 static const char c_acSVM[] = ".svm"; 00028 static const char c_acOUT[] = ".out"; 00029 00030 enum EMethod { 00031 EMethodBegin = 0, 00032 EMethodMean = EMethodBegin, 00033 EMethodMax = EMethodMean + 1, 00034 EMethodQuant = EMethodMax + 1, 00035 EMethodEnd = EMethodQuant + 1 00036 }; 00037 00038 static const char* c_aszMethods[] = { 00039 "mean", "max", "quant",NULL 00040 }; 00041 00042 00043 float Percentile(vector<float>& vecVals, float quartile) { 00044 //size_t iOne, iTwo, iSize; 00045 //float d, dFrac; 00046 size_t iSize; 00047 float value, value2; 00048 00049 iSize = vecVals.size(); 00050 if(iSize == 0) 00051 return CMeta::GetNaN(); 00052 00053 if(iSize == 1) 00054 return vecVals[0]; 00055 00056 std::sort(vecVals.begin(), vecVals.end()); 00057 00058 float index = quartile * (iSize + 1); 00059 float remainder = index - (size_t)index; 00060 00061 index = (size_t)index - 1; 00062 00063 if(remainder == 0){ 00064 return vecVals[(size_t)index]; 00065 }else{ 00066 if(remainder > 0.5){ 00067 value = vecVals[(size_t)index-1]; 00068 value2 = vecVals[(size_t)index]; 00069 }else{ 00070 value = vecVals[(size_t)index]; 00071 value2 = vecVals[(size_t)index+1]; 00072 } 00073 float interpolationValue = (value2 - value) * remainder; 00074 00075 cerr << value << " " << value2 << " " << remainder <<endl; 00076 //float interpolationValue = (value - vecVals[((size_t)index-1)]) * remainder; 00077 //cerr << interpolationValue << endl; 00078 00079 //return (vecVals[(size_t)index] + interpolationValue); 00080 return (value + interpolationValue); 00081 } 00082 } 00083 00084 int main( int iArgs, char** aszArgs ) { 00085 gengetopt_args_info sArgs; 00086 int iRet; 00087 size_t i, j, k, l; 00088 size_t iRuns, inputNums; 00089 float d, dout; 00090 DIR* dp; 00091 struct dirent* ep; 00092 CDat DatOut, DatTrack, DatCur; 00093 vector<size_t> veciGenesCur; 00094 // store all input file names 00095 vector<string> input_files; 00096 vector<string> vecstrInputs, vecstrDatasets; 00097 EMethod eMethod; 00098 ifstream ifsm; 00099 vector<float> vecWeights; 00100 map<string, size_t> mapstriDatasets; 00101 CGenome Genome; 00102 CGenes Genes( Genome ); 00103 00104 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00105 cmdline_parser_print_help( ); 00106 return 1; } 00107 CMeta Meta( sArgs.verbosity_arg ); 00108 00109 for( eMethod = EMethodBegin; eMethod < EMethodEnd; eMethod = (EMethod)( eMethod + 1 ) ) 00110 if( !strcmp( c_aszMethods[eMethod], sArgs.method_arg ) ){ 00111 cerr << "combine method: " << c_aszMethods[eMethod] << endl; 00112 break; 00113 } 00114 00115 // now collect the data files from directory if given 00116 if(sArgs.directory_given){ 00117 dp = opendir (sArgs.directory_arg); 00118 i = 0; 00119 if (dp != NULL){ 00120 while (ep = readdir (dp)){ 00121 if( strstr(ep->d_name, c_acDab) != NULL || 00122 strstr(ep->d_name, c_acQDab) != NULL || 00123 strstr(ep->d_name, c_acDat) != NULL 00124 ){ 00125 if(strstr(ep->d_name, c_acSVM) != NULL) 00126 continue; 00127 if(strstr(ep->d_name, c_acOUT) != NULL) 00128 continue; 00129 vecstrInputs.push_back((string)sArgs.directory_arg + "/" + ep->d_name); 00130 vecstrDatasets.push_back(vecstrInputs[i]); 00131 mapstriDatasets[ ep->d_name ] = i; 00132 ++i; 00133 } 00134 } 00135 (void) closedir (dp); 00136 00137 inputNums = vecstrDatasets.size(); 00138 00139 // sort by ASCI 00140 //std::sort( vecstrInputs.begin(), vecstrInputs.end() ); 00141 00142 cerr << "Number of datasets: " << inputNums << '\n'; 00143 } 00144 else{ 00145 cerr << "Couldn't open the directory: " << sArgs.directory_arg << '\n'; 00146 return 1; 00147 } 00148 } 00149 00150 // read dataset weight file 00151 if( sArgs.weight_given ) { 00152 static const size_t c_iBuffer = 1024; 00153 char szBuffer[ c_iBuffer ]; 00154 string strFile; 00155 vector<string> vecstrLine; 00156 map<string, size_t>::const_iterator iterDataset; 00157 size_t numDataset; 00158 vecWeights.resize(vecstrInputs.size( ) ); 00159 00160 // set default weight 00161 for(i=0; i < vecWeights.size(); i++){ 00162 vecWeights[i] = 0; 00163 } 00164 00165 ifsm.clear( ); 00166 ifsm.open( sArgs.weight_arg ); 00167 if( !ifsm.is_open( ) ) { 00168 cerr << "Could not open: " << sArgs.weight_arg << endl; 00169 return 1; 00170 } 00171 while( !ifsm.eof( ) ) { 00172 ifsm.getline( szBuffer, c_iBuffer - 1 ); 00173 szBuffer[ c_iBuffer - 1 ] = 0; 00174 if( !szBuffer[ 0 ] ) 00175 continue; 00176 vecstrLine.clear( ); 00177 CMeta::Tokenize( szBuffer, vecstrLine ); 00178 if( vecstrLine.size( ) != 2 ) { 00179 cerr << "Illegal weight line: " << szBuffer << endl; 00180 return 1; 00181 } 00182 00183 if( ( iterDataset = mapstriDatasets.find( vecstrLine[ 0 ] ) ) == mapstriDatasets.end( ) ) 00184 cerr << "Dataset in weights but not database: " << vecstrLine[ 0 ] << endl; 00185 else 00186 vecWeights[ iterDataset->second ] = (float)atof( vecstrLine[ 1 ].c_str( ) ); 00187 } 00188 ifsm.close( ); 00189 00190 // now only keep datasets with non-zero weighting 00191 vecstrDatasets.clear(); 00192 numDataset = 0; 00193 for(i = 0; i < vecstrInputs.size(); ++i){ 00194 if(vecWeights[i] == 0) 00195 continue; 00196 vecstrDatasets.push_back(vecstrInputs[i]); 00197 vecWeights[numDataset] = vecWeights[i]; 00198 numDataset++; 00199 } 00200 vecstrDatasets.resize(numDataset); 00201 vecWeights.resize(numDataset); 00202 00203 cerr << "Total number of datasets combining with non-zero weights: " << numDataset << endl; 00204 } 00205 00208 if( eMethod == EMethodQuant ){ 00209 CDatasetCompact Dataset; 00210 vector<float> vecVals; 00211 size_t val; 00212 00213 if( !Dataset.Open( vecstrDatasets ) ) { 00214 cerr << "Couldn't open input datasets for quantile combine." << endl; 00215 return 1; } 00216 00217 // initialized the output dab and pair value vector 00218 DatOut.Open(Dataset.GetGeneNames()); 00219 vecVals.resize(Dataset.GetExperiments()); 00220 00221 // debug 00222 cerr << "num dataset: " << Dataset.GetExperiments() << endl; 00223 00224 for( i = 0; i < DatOut.GetGenes(); ++i ){ 00225 for( j = i+1; j < DatOut.GetGenes(); ++j ){ 00226 // iterate over each dataset 00227 vecVals.clear(); 00228 for( k = 0; k < Dataset.GetExperiments(); ++k ){ 00229 // debug 00230 //cerr << "Orig val: " << val << endl; 00231 00232 if( (val = Dataset.GetDiscrete( i, j, k)) == -1 ) 00233 continue; 00234 00235 // debug 00236 //cerr << "Got val: " << val << endl; 00237 if( sArgs.weight_given ){ 00238 val *= vecWeights[k]; 00239 } 00240 00241 vecVals.push_back(val); 00242 } 00243 00244 if(vecVals.size() < 1) 00245 continue; 00246 00247 // debug 00248 //cerr << "quant val: " << CStatistics::Percentile(vecVals.begin(), vecVals.end(), sArgs.quantile_arg) << endl; 00249 //cerr << "stats: " << vecVals.begin() << " " << vecVals.end() << " " << endl; //(vecVals.end()-vecVals.begin()) << endl; 00250 //cerr << "stats: " << (vecVals.end()-vecVals.begin()) << endl; 00251 DatOut.Set(i, j, CStatistics::Percentile(vecVals.begin(), vecVals.end(), sArgs.quantile_arg)); 00252 00253 //DatOut.Set(i, j, Percentile(vecVals, sArgs.quantile_arg)); 00254 //for(size_t t = 0; t < vecVals.size(); t++ ) 00255 // cerr << vecVals[t] << ' '; 00256 //cerr << endl; 00257 } 00258 } 00259 00260 DatOut.Save( sArgs.output_arg ); 00261 return 0; 00262 } 00263 00264 00265 // now iterate dat/dab networks 00266 for( i = 0; i < vecstrDatasets.size( ); ++i ) { 00267 00268 // open first network, we will just add edge weights to this CDat 00269 if( i == 0 ){ 00270 if( !DatCur.Open( vecstrDatasets[ i ].c_str() ) ) { 00271 cerr << "Couldn't open: " << vecstrDatasets[ i ] << endl; 00272 return 1; } 00273 00274 DatOut.Open( DatCur ); 00275 DatTrack.Open( DatCur ); 00276 00277 // this Dat is used to track various values (count, max) 00278 for( j = 0; j < DatTrack.GetGenes( ); ++j ) 00279 for( k = ( j + 1 ); k < DatTrack.GetGenes( ); ++k ){ 00280 if( CMeta::IsNaN( d = DatCur.Get( j, k))) 00281 continue; 00282 00283 if( sArgs.weight_given ){ 00284 d *= vecWeights[i]; 00285 // store weighted value (numerator) 00286 DatOut.Set( j, k, d); 00287 } 00288 00289 switch( eMethod ) { 00290 case EMethodMax: 00291 DatOut.Set( j, k, d); 00292 break; 00293 case EMethodMean: 00294 // keep track of denominator 00295 DatTrack.Set( j, k, 1.0); 00296 } 00297 } 00298 00299 cerr << "opened: " << vecstrDatasets[ i ] << endl; 00300 continue; 00301 } 00302 00303 if( !DatCur.Open( vecstrDatasets[ i ].c_str() ) ) { 00304 cerr << "Couldn't open: " << vecstrDatasets[ i ] << endl; 00305 return 1; } 00306 cerr << "opened: " << vecstrDatasets[ i ] << endl; 00307 00308 if( sArgs.map_flag ){ 00309 // Get gene index match 00310 cerr << "inside map flag" << endl; 00311 veciGenesCur.clear(); 00312 veciGenesCur.resize(DatOut.GetGenes()); 00313 for( l = 0; l < DatOut.GetGenes(); l++){ 00314 veciGenesCur[ l ] = DatCur.GetGene( DatOut.GetGene(l) ); 00315 if( veciGenesCur[ l ] == -1 ){ 00316 cerr << "ERROR: missing gene: " << DatOut.GetGene(l) << ", " << vecstrDatasets[ i ] << endl; 00317 return 1; 00318 } 00319 } 00320 } 00321 00322 // now add edges to Dat 00323 for( j = 0; j < DatOut.GetGenes( ); ++j ) 00324 for( k = ( j + 1 ); k < DatOut.GetGenes( ); ++k ) { 00325 00326 if( sArgs.map_flag ){ 00327 if( CMeta::IsNaN( d = DatCur.Get( veciGenesCur[ j ], veciGenesCur[ k ] ) ) ){ 00328 continue; 00329 } 00330 } 00331 else{ 00332 if( CMeta::IsNaN( d = DatCur.Get( j, k ) ) ){ 00333 continue; 00334 } 00335 } 00336 00337 if( sArgs.weight_given ) 00338 d *= vecWeights[i]; 00339 00340 switch( eMethod ) { 00341 case EMethodMax: 00342 if( CMeta::IsNaN( (dout = DatOut.Get( j, k ))) || d > dout ) 00343 DatOut.Set( j, k, d); 00344 break; 00345 00346 case EMethodMean: 00347 DatTrack.Set( j, k, DatTrack.Get( j, k ) + 1 ); 00348 00349 if( CMeta::IsNaN( (dout = DatOut.Get( j, k ))) ) 00350 DatOut.Set( j, k, d ); 00351 else 00352 DatOut.Set( j, k, DatOut.Get( j, k ) + d ); 00353 break; 00354 } 00355 } 00356 } 00357 00358 switch( eMethod ) { 00359 case EMethodMean: 00360 // now convert sum to mean 00361 for( j = 0; j < DatOut.GetGenes( ); ++j ) 00362 for( k = ( j + 1 ); k < DatOut.GetGenes( ); ++k ) 00363 DatOut.Set( j, k, DatOut.Get( j, k ) / DatTrack.Get( j, k ) ); 00364 } 00365 00366 // Filter dat 00367 if( sArgs.genes_given ) { 00368 ifsm.clear( ); 00369 ifsm.open( sArgs.genes_arg ); 00370 if( !Genes.Open( ifsm ) ) { 00371 cerr << "Could not open: " << sArgs.genes_arg << endl; 00372 return 1; } 00373 ifsm.close( ); 00374 00375 DatOut.FilterGenes( Genes, CDat::EFilterInclude ); 00376 } 00377 00378 if( sArgs.genee_given ){ 00379 cerr << "Filter only edges with genes in: " << sArgs.genee_arg << endl; 00380 if( ! DatOut.FilterGenes( sArgs.genee_arg, CDat::EFilterEdge ) ){ 00381 cerr << "ERROR can't open file: " << sArgs.genee_arg << endl; 00382 return 1; 00383 } 00384 } 00385 00386 DatOut.Save( sArgs.output_arg ); 00387 return iRet; 00388 }