Sleipnir
/*****************************************************************************
 * This file is provided under the Creative Commons Attribution 3.0 license.
 *
 * You are free to share, copy, distribute, transmit, or adapt this work
 * PROVIDED THAT you attribute the work to the authors listed below.
 * For more information, please see the following web page:
 * http://creativecommons.org/licenses/by/3.0/
 *
 * This file is a component of the Sleipnir library for functional genomics,
 * authored by:
 *   Curtis Huttenhower (chuttenh@princeton.edu)
 *   Mark Schroeder
 *   Maria D. Chikina
 *   Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
 *
 * If you use this library, the included executable tools, or any related
 * code in your work, please cite the following publication:
 *   Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
 *   Olga G. Troyanskaya.
 *   "The Sleipnir library for computational functional genomics"
 *****************************************************************************/
#include "stdafx.h"
#include <iostream>
#include <fstream>
#include <cmath>
#include <limits>
#include <string>
#include <vector>
#include <map>
#include <algorithm>
#include <cstring>
#include <sys/stat.h>
#include <dirent.h>

// Logit transform; clamp the endpoints to finite values so that
// probabilities of exactly 0 or 1 do not produce -inf/+inf.
float logit(float x) {
    if (x == 0.0f)  // effectively the minimum representable probability
        return -15;
    if (x == 1.0f)  // effectively the maximum representable probability
        return 15;
    return log(x / (1 - x));
}

// Standard logistic (inverse logit) function.
float sigmoid(float x) {
    float exp_value;
    float return_value;

    /*** Exponential calculation ***/
    exp_value = exp((double) -x);

    /*** Final sigmoid value ***/
    return_value = 1 / (1 + exp_value);

    return return_value;
}

// Shift a posterior probability in logit space by the log prior odds
// log(p/n), i.e. subtract the prior, in effect giving equal prior to the
// p positive and n negative examples.
float equal_prior(float x, size_t p, size_t n) {
    float logit_value;
    float return_value;

    logit_value = logit(x) + log(((float) p) / n);
    return_value = sigmoid(logit_value);

    return return_value;
}
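/* Illustrative worked examples (added for clarity; not part of the original
 * source, values approximate):
 *
 *   logit(0.8) = ln(0.8 / 0.2) ~= 1.386
 *   logit(0.6) = ln(0.6 / 0.4) ~= 0.405
 *   averaging in logit space: (1.386 + 0.405) / 2 ~= 0.896
 *   sigmoid(0.896) = 1 / (1 + e^-0.896) ~= 0.710
 *
 * equal_prior with p = 100 positives and n = 900 negatives:
 *   equal_prior(0.5, 100, 900) = sigmoid(logit(0.5) + ln(100/900))
 *                              = sigmoid(0 - 2.197) ~= 0.1 = p / (p + n)
 */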
// Return true if a file with the given name exists.
inline bool exists(const std::string& name) {
    struct stat buffer;
    return (stat(name.c_str(), &buffer) == 0);
}

int main(int iArgs, char** aszArgs) {

    // sanity checks kept from development:
    // cout << sigmoid(logit(0.999999)) << endl;
    // cout << sigmoid(logit(0.9)) << endl;
    // cout << sigmoid(logit(0.1)) << endl;
    // cout << sigmoid(logit(1)) << endl;
    // cout << sigmoid(logit(0)) << endl;

    gengetopt_args_info sArgs;
    int iRet = 0;
    size_t i, j, k, l;
    float d;
    DIR* dp;
    struct dirent* ep;
    CDat DatOut, DatCur;
    vector<size_t> veciGenesCur;

    // context weight filename
    std::string weight_filename;
    std::vector<string> input_files;
    std::string dab_dir;

    // store prior information
    vector<size_t> vpos;
    vector<size_t> vneg;

    // context weights
    std::vector<std::string> terms;
    std::map<std::string, double> term2weight;
    double weight_sum = 0.0;

    if (cmdline_parser(iArgs, aszArgs, &sArgs)) {
        cmdline_parser_print_help();
        return 1;
    }
    CMeta Meta(sArgs.verbosity_arg);

    // int type = sArgs.type_arg;  // type of combiner (unused)

    // check that the given dat/dab directory is valid
    if (sArgs.directory_arg) {
        dp = opendir(sArgs.directory_arg);
        if (dp != NULL) {
            (void) closedir(dp);
            dab_dir = sArgs.directory_arg;
        } else {
            cerr << "Couldn't open the directory: " << sArgs.directory_arg << '\n';
            return 1;
        }
    }

    if (sArgs.weights_arg) { // only if a weight file was given
        weight_filename = sArgs.weights_arg;
        std::ifstream weight_file(weight_filename.c_str());
        cout << "Using weight file: " << weight_filename << endl;

        // read context weights; skip empty terms and negative weights
        if (weight_file.is_open()) {
            std::string term;
            double weight;
            while (weight_file >> term >> weight) {
                if (term.length() < 1)
                    continue;
                if (weight < 0.0) // ignore weights less than zero
                    continue;
                cout << term << endl;
                terms.push_back(term);
                term2weight[term] = weight;
                weight_sum += weight;
            }
            weight_file.close();
        }

        cout << "The number of contexts with weight: " << terms.size() << endl;
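        /* Example weights file format (hypothetical contents, added for
         * clarity; the loop above expects whitespace-separated
         * "term weight" pairs):
         *
         *   context_ribosome      0.85
         *   context_splicing      0.42
         *   context_replication  -0.10   <- skipped: negative weight
         *
         * Each term is then looked up as <directory>/<term>.dab below.
         */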
        // keep only the networks/contexts that both have a weight and exist
        // as a .dab file in the directory
        std::vector<string> subset_terms;
        std::string dabname;
        for (j = 0; j < terms.size(); ++j) {
            dabname = dab_dir + "/" + terms[j] + ".dab";
            if (exists(dabname)) {
                cout << dabname << endl;
                input_files.push_back(dabname);
                subset_terms.push_back(terms[j]);
            }
        }
        terms = subset_terms;
    } else {
        // no weight file: use every .dab file in the directory
        dp = opendir(dab_dir.c_str());
        if (dp != NULL) {
            while ((ep = readdir(dp)) != NULL) {
                // skip . and .. entries and temp files ending with ~
                if (ep->d_name[0] == '.' || ep->d_name[strlen(ep->d_name) - 1] == '~')
                    continue;
                std::string fname(ep->d_name);
                if (fname.length() >= 4 && fname.compare(fname.length() - 4, 4, ".dab") == 0)
                    input_files.push_back((string) sArgs.directory_arg + "/" + fname);
            }
            (void) closedir(dp);
        }

        // TODO: temporary hack based on the current implementation; should be
        // improved.  Sort so that the input file and term arrays correspond
        // by index, and give every context an equal weight of 1.
        std::sort(input_files.begin(), input_files.end());
        for (i = 0; i < input_files.size(); ++i) {
            terms.push_back(input_files[i]);
            term2weight[input_files[i]] = 1;
        }
        weight_sum = input_files.size();
    }

    // now iterate over the dat/dab networks
    for (i = 0; i < input_files.size(); ++i) {

        // open dat/dab network
        if (!DatCur.Open(input_files[i].c_str())) {
            cerr << "Couldn't open: " << input_files[i] << endl;
            return 1;
        }
        cerr << "opened: " << input_files[i] << endl;

        if (sArgs.prior_arg) {
            // rescale each edge so positives and negatives get equal prior
            for (j = 0; j < DatCur.GetGenes(); ++j)
                for (k = (j + 1); k < DatCur.GetGenes(); ++k)
                    DatCur.Set(j, k, equal_prior(DatCur.Get(j, k), vpos[i], vneg[i]));
        }

        if (sArgs.logit_flag) {
            // combine in logit space rather than probability space
            for (j = 0; j < DatCur.GetGenes(); ++j)
                for (k = (j + 1); k < DatCur.GetGenes(); ++k)
                    DatCur.Set(j, k, logit(DatCur.Get(j, k)));
        }

        if (sArgs.znormalize_flag) {
            DatCur.Normalize(CDat::ENormalizeZScore);
        }

        cerr << term2weight[terms[i]] << endl;

        // the first network initializes DatOut; its edges are simply scaled
        // by the context weight
        if (i == 0) {
            DatOut.Open(DatCur);
            for (j = 0; j < DatOut.GetGenes(); ++j)
                for (k = (j + 1); k < DatOut.GetGenes(); ++k)
                    DatOut.Set(j, k, DatOut.Get(j, k) * term2weight[terms[i]]);
            continue;
        }

        if (sArgs.map_flag) {
            // map gene indices of DatOut onto the current network
            veciGenesCur.clear();
            veciGenesCur.resize(DatOut.GetGenes());
            for (l = 0; l < DatOut.GetGenes(); l++) {
                veciGenesCur[l] = DatCur.GetGene(DatOut.GetGene(l));
                if (veciGenesCur[l] == (size_t) -1) {
                    cerr << "ERROR: missing gene " << DatOut.GetGene(l)
                         << " in " << input_files[i] << endl;
                    return 1;
                }
            }
        }

        cerr << "add a network" << endl;

        // now add the weighted edges to DatOut
        for (j = 0; j < DatOut.GetGenes(); ++j)
            for (k = (j + 1); k < DatOut.GetGenes(); ++k) {

                if (sArgs.map_flag) {
                    // we are assuming a fully connected network
                    if (CMeta::IsNaN(d = DatCur.Get(veciGenesCur[j], veciGenesCur[k]))) {
                        cerr << "ERROR: missing values " << input_files[i] << endl;
                        return 1;
                    }
                } else {
                    if (CMeta::IsNaN(d = DatCur.Get(j, k))) {
                        cerr << "ERROR: missing values " << input_files[i] << endl;
                        return 1;
                    }
                }

                // weight the edge by its context weight
                // TODO: right now this implementation depends on the order of
                // the terms vector and the input_files vector corresponding
                DatOut.Set(j, k, DatOut.Get(j, k) + d * term2weight[terms[i]]);
            }
    }
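    /* Note added for clarity (not in the original source): at this point each
     * edge of DatOut holds the weighted sum
     *
     *   DatOut(j,k) = sum_i  w_i * f( Dat_i(j,k) )
     *
     * where f is the identity, the logit transform (logit_flag), and/or a
     * z-score normalization (znormalize_flag), depending on the options
     * above.  The loop below divides by sum_i w_i (weight_sum) to turn the
     * sum into a weighted mean.
     */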
    // now convert the sum to a weighted mean
    for (j = 0; j < DatOut.GetGenes(); ++j)
        for (k = (j + 1); k < DatOut.GetGenes(); ++k) {
            DatOut.Set(j, k, DatOut.Get(j, k) / weight_sum);

            if (sArgs.logit_flag) // transform back to probability values
                DatOut.Set(j, k, sigmoid(DatOut.Get(j, k)));
        }

    DatOut.Save(sArgs.output_arg);
    return iRet;
}
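/* Hypothetical usage sketch (added for clarity; the binary name is a
 * placeholder and the option names are inferred from the gengetopt fields
 * above -- directory_arg, weights_arg, logit_flag, output_arg -- not
 * confirmed by this file):
 *
 *   ./Combiner --directory contexts/ --weights weights.txt \
 *              --logit --output combined.dab
 *
 * This would average the weighted context networks in logit space and save
 * the resulting combined network as combined.dab.
 */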