Sleipnir
/*****************************************************************************
 * This file is provided under the Creative Commons Attribution 3.0 license.
 *
 * You are free to share, copy, distribute, transmit, or adapt this work
 * PROVIDED THAT you attribute the work to the authors listed below.
 * For more information, please see the following web page:
 * http://creativecommons.org/licenses/by/3.0/
 *
 * This file is a component of the Sleipnir library for functional genomics,
 * authored by:
 *   Curtis Huttenhower (chuttenh@princeton.edu)
 *   Mark Schroeder
 *   Maria D. Chikina
 *   Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
 *
 * If you use this library, the included executable tools, or any related
 * code in your work, please cite the following publication:
 *   Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
 *   Olga G. Troyanskaya.
 *   "The Sleipnir library for computational functional genomics"
 *****************************************************************************/
#include "stdafx.h"
#include <iostream>
#include <fstream>
#include <cmath>
#include <limits>
#include <string>
#include <vector>
#include <map>
#include <algorithm>
#include <cstring>
#include <sys/stat.h>
#include <dirent.h>

// Logit transform; clamp the endpoints to finite values so that
// probabilities of exactly 0 or 1 do not produce -inf/+inf.
float logit(float x) {
    if (x == 0.0f)  // effectively the minimum representable probability
        return -15;
    if (x == 1.0f)  // effectively the maximum representable probability
        return 15;
    return log(x / (1 - x));
}

// Standard logistic (inverse logit) function.
float sigmoid(float x) {
    float exp_value;
    float return_value;

    /*** Exponential calculation ***/
    exp_value = exp((double) -x);

    /*** Final sigmoid value ***/
    return_value = 1 / (1 + exp_value);

    return return_value;
}

// Shift a posterior probability in logit space by the log prior odds
// log(p/n), i.e. subtract the prior, in effect giving equal prior to the
// p positive and n negative examples.
float equal_prior(float x, size_t p, size_t n) {
    float logit_value;
    float return_value;

    logit_value = logit(x) + log(((float) p) / n);
    return_value = sigmoid(logit_value);

    return return_value;
}
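/* Illustrative worked examples (added for clarity; not part of the original
 * source, values approximate):
 *
 *   logit(0.8) = ln(0.8 / 0.2) ~= 1.386
 *   logit(0.6) = ln(0.6 / 0.4) ~= 0.405
 *   averaging in logit space: (1.386 + 0.405) / 2 ~= 0.896
 *   sigmoid(0.896) = 1 / (1 + e^-0.896) ~= 0.710
 *
 * equal_prior with p = 100 positives and n = 900 negatives:
 *   equal_prior(0.5, 100, 900) = sigmoid(logit(0.5) + ln(100/900))
 *                              = sigmoid(0 - 2.197) ~= 0.1 = p / (p + n)
 */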
// Return true if a file with the given name exists.
inline bool exists(const std::string& name) {
    struct stat buffer;
    return (stat(name.c_str(), &buffer) == 0);
}

int main(int iArgs, char** aszArgs) {

    // sanity checks kept from development:
    // cout << sigmoid(logit(0.999999)) << endl;
    // cout << sigmoid(logit(0.9)) << endl;
    // cout << sigmoid(logit(0.1)) << endl;
    // cout << sigmoid(logit(1)) << endl;
    // cout << sigmoid(logit(0)) << endl;

    gengetopt_args_info sArgs;
    int iRet = 0;
    size_t i, j, k, l;
    float d;
    DIR* dp;
    struct dirent* ep;
    CDat DatOut, DatCur;
    vector<size_t> veciGenesCur;

    // context weight filename
    std::string weight_filename;
    std::vector<string> input_files;
    std::string dab_dir;

    // store prior information
    vector<size_t> vpos;
    vector<size_t> vneg;

    // context weights
    std::vector<std::string> terms;
    std::map<std::string, double> term2weight;
    double weight_sum = 0.0;

    if (cmdline_parser(iArgs, aszArgs, &sArgs)) {
        cmdline_parser_print_help();
        return 1;
    }
    CMeta Meta(sArgs.verbosity_arg);

    // int type = sArgs.type_arg;  // type of combiner (unused)

    // check that the given dat/dab directory is valid
    if (sArgs.directory_arg) {
        dp = opendir(sArgs.directory_arg);
        if (dp != NULL) {
            (void) closedir(dp);
            dab_dir = sArgs.directory_arg;
        } else {
            cerr << "Couldn't open the directory: " << sArgs.directory_arg << '\n';
            return 1;
        }
    }

    if (sArgs.weights_arg) { // only if a weight file was given
        weight_filename = sArgs.weights_arg;
        std::ifstream weight_file(weight_filename.c_str());
        cout << "Using weight file: " << weight_filename << endl;

        // read context weights; skip empty terms and negative weights
        if (weight_file.is_open()) {
            std::string term;
            double weight;
            while (weight_file >> term >> weight) {
                if (term.length() < 1)
                    continue;
                if (weight < 0.0) // ignore weights less than zero
                    continue;
                cout << term << endl;
                terms.push_back(term);
                term2weight[term] = weight;
                weight_sum += weight;
            }
            weight_file.close();
        }

        cout << "The number of contexts with weight: " << terms.size() << endl;
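        /* Example weights file format (hypothetical contents, added for
         * clarity; the loop above expects whitespace-separated
         * "term weight" pairs):
         *
         *   context_ribosome      0.85
         *   context_splicing      0.42
         *   context_replication  -0.10   <- skipped: negative weight
         *
         * Each term is then looked up as <directory>/<term>.dab below.
         */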
        // keep only the networks/contexts that both have a weight and exist
        // as a .dab file in the directory
        std::vector<string> subset_terms;
        std::string dabname;
        for (j = 0; j < terms.size(); ++j) {
            dabname = dab_dir + "/" + terms[j] + ".dab";
            if (exists(dabname)) {
                cout << dabname << endl;
                input_files.push_back(dabname);
                subset_terms.push_back(terms[j]);
            }
        }
        terms = subset_terms;
    } else {
        // no weight file: use every .dab file in the directory
        dp = opendir(dab_dir.c_str());
        if (dp != NULL) {
            while ((ep = readdir(dp)) != NULL) {
                // skip . and .. entries and temp files ending with ~
                if (ep->d_name[0] == '.' || ep->d_name[strlen(ep->d_name) - 1] == '~')
                    continue;
                std::string fname(ep->d_name);
                if (fname.length() >= 4 && fname.compare(fname.length() - 4, 4, ".dab") == 0)
                    input_files.push_back((string) sArgs.directory_arg + "/" + fname);
            }
            (void) closedir(dp);
        }

        // TODO: temporary hack based on the current implementation; should be
        // improved.  Sort so that the input file and term arrays correspond
        // by index, and give every context an equal weight of 1.
        std::sort(input_files.begin(), input_files.end());
        for (i = 0; i < input_files.size(); ++i) {
            terms.push_back(input_files[i]);
            term2weight[input_files[i]] = 1;
        }
        weight_sum = input_files.size();
    }

    // now iterate over the dat/dab networks
    for (i = 0; i < input_files.size(); ++i) {

        // open dat/dab network
        if (!DatCur.Open(input_files[i].c_str())) {
            cerr << "Couldn't open: " << input_files[i] << endl;
            return 1;
        }
        cerr << "opened: " << input_files[i] << endl;

        if (sArgs.prior_arg) {
            // rescale each edge so positives and negatives get equal prior
            for (j = 0; j < DatCur.GetGenes(); ++j)
                for (k = (j + 1); k < DatCur.GetGenes(); ++k)
                    DatCur.Set(j, k, equal_prior(DatCur.Get(j, k), vpos[i], vneg[i]));
        }

        if (sArgs.logit_flag) {
            // combine in logit space rather than probability space
            for (j = 0; j < DatCur.GetGenes(); ++j)
                for (k = (j + 1); k < DatCur.GetGenes(); ++k)
                    DatCur.Set(j, k, logit(DatCur.Get(j, k)));
        }

        if (sArgs.znormalize_flag) {
            DatCur.Normalize(CDat::ENormalizeZScore);
        }

        cerr << term2weight[terms[i]] << endl;

        // the first network initializes DatOut; its edges are simply scaled
        // by the context weight
        if (i == 0) {
            DatOut.Open(DatCur);
            for (j = 0; j < DatOut.GetGenes(); ++j)
                for (k = (j + 1); k < DatOut.GetGenes(); ++k)
                    DatOut.Set(j, k, DatOut.Get(j, k) * term2weight[terms[i]]);
            continue;
        }

        if (sArgs.map_flag) {
            // map gene indices of DatOut onto the current network
            veciGenesCur.clear();
            veciGenesCur.resize(DatOut.GetGenes());
            for (l = 0; l < DatOut.GetGenes(); l++) {
                veciGenesCur[l] = DatCur.GetGene(DatOut.GetGene(l));
                if (veciGenesCur[l] == (size_t) -1) {
                    cerr << "ERROR: missing gene " << DatOut.GetGene(l)
                         << " in " << input_files[i] << endl;
                    return 1;
                }
            }
        }

        cerr << "add a network" << endl;

        // now add the weighted edges to DatOut
        for (j = 0; j < DatOut.GetGenes(); ++j)
            for (k = (j + 1); k < DatOut.GetGenes(); ++k) {

                if (sArgs.map_flag) {
                    // we are assuming a fully connected network
                    if (CMeta::IsNaN(d = DatCur.Get(veciGenesCur[j], veciGenesCur[k]))) {
                        cerr << "ERROR: missing values " << input_files[i] << endl;
                        return 1;
                    }
                } else {
                    if (CMeta::IsNaN(d = DatCur.Get(j, k))) {
                        cerr << "ERROR: missing values " << input_files[i] << endl;
                        return 1;
                    }
                }

                // weight the edge by its context weight
                // TODO: right now this implementation depends on the order of
                // the terms vector and the input_files vector corresponding
                DatOut.Set(j, k, DatOut.Get(j, k) + d * term2weight[terms[i]]);
            }
    }
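    /* Note added for clarity (not in the original source): at this point each
     * edge of DatOut holds the weighted sum
     *
     *   DatOut(j,k) = sum_i  w_i * f( Dat_i(j,k) )
     *
     * where f is the identity, the logit transform (logit_flag), and/or a
     * z-score normalization (znormalize_flag), depending on the options
     * above.  The loop below divides by sum_i w_i (weight_sum) to turn the
     * sum into a weighted mean.
     */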
    // now convert the sum to a weighted mean
    for (j = 0; j < DatOut.GetGenes(); ++j)
        for (k = (j + 1); k < DatOut.GetGenes(); ++k) {
            DatOut.Set(j, k, DatOut.Get(j, k) / weight_sum);

            if (sArgs.logit_flag) // transform back to probability values
                DatOut.Set(j, k, sigmoid(DatOut.Get(j, k)));
        }

    DatOut.Save(sArgs.output_arg);
    return iRet;
}
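/* Hypothetical usage sketch (added for clarity; the binary name is a
 * placeholder and the option names are inferred from the gengetopt fields
 * above -- directory_arg, weights_arg, logit_flag, output_arg -- not
 * confirmed by this file):
 *
 *   ./Combiner --directory contexts/ --weights weights.txt \
 *              --logit --output combined.dab
 *
 * This would average the weighted context networks in logit space and save
 * the resulting combined network as combined.dab.
 */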