Sleipnir
tools/Funcifier/Funcifier.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 enum EShared {
00026     ESharedIgnore   = 0,
00027     ESharedDiscard  = ESharedIgnore + 1,
00028     ESharedOneOnly  = ESharedDiscard + 1
00029 };
00030 
00031 static const char*  c_aszShared[]   = {"ignore", "discard", "oneonly"};
00032 
00033 int main( int iArgs, char** aszArgs ) {
00034     gengetopt_args_info     sArgs;
00035     CDat                    DatIn;
00036     size_t                  iF1, iF2, i, j, iOne, iTwo, iCountIn, iCountOut, iSharedOne, iSharedTwo;
00037     CGenome                 Genome;
00038     vector<CGenes*>         vecpGenes;
00039     vector<string>          vecstrNames;
00040     vector<vector<size_t> > vecveciGenes;
00041     vector<vector<float> >  vecvecdGenes;
00042     float                   d, dAveIn, dAveOut, dOne, dTwo;
00043     double                  dCountIn;
00044     ofstream                ofsm;
00045     EShared                 eShared;
00046 
00047     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00048         cmdline_parser_print_help( );
00049         return 1; }
00050     CMeta Meta( sArgs.verbosity_arg );
00051 
00052     for( i = 0; i < ARRAYSIZE(c_aszShared); ++i )
00053         if( !strcmp( sArgs.shared_arg, c_aszShared[ i ] ) )
00054             break;
00055     if( i >= ARRAYSIZE(c_aszShared) ) {
00056         cmdline_parser_print_help( );
00057         return 1; }
00058     eShared = (EShared)i;
00059 
00060     if( !DatIn.Open( sArgs.input_arg, sArgs.memmap_flag && !sArgs.normalize_flag ) ) {
00061         cerr << "Could not open: " << sArgs.input_arg << endl;
00062         return 1; }
00063     if( sArgs.normalize_flag )
00064         DatIn.Normalize( CDat::ENormalizeSigmoid );
00065 
00066     vecpGenes.resize( sArgs.inputs_num );
00067     vecstrNames.resize( vecpGenes.size( ) );
00068     vecveciGenes.resize( vecpGenes.size( ) );
00069     for( i = 0; i < vecpGenes.size( ); ++i ) {
00070         vecstrNames[ i ] = CMeta::Basename( sArgs.inputs[ i ] );
00071         while( ( j = vecstrNames[ i ].rfind( '.' ) ) != string::npos )
00072             vecstrNames[ i ] = vecstrNames[ i ].substr( 0, j );
00073         vecpGenes[ i ] = new CGenes( Genome );
00074         if( !vecpGenes[ i ]->Open( sArgs.inputs[ i ] ) ) {
00075             cerr << "Could not open: " << sArgs.inputs[ i ] << endl;
00076             return 1; }
00077 
00078         vecveciGenes[ i ].resize( vecpGenes[ i ]->GetGenes( ) );
00079         for( j = 0; j < vecpGenes[ i ]->GetGenes( ); ++j )
00080             vecveciGenes[ i ][ j ] = DatIn.GetGene( vecpGenes[ i ]->GetGene( j ).GetName( ) ); }
00081     if( sArgs.weights_arg ) {
00082         CPCL    PCLWeights;
00083 
00084         vecvecdGenes.resize( vecpGenes.size( ) );
00085         if( !PCLWeights.Open( sArgs.weights_arg, 0 ) ) {
00086             cerr << "Could not open: " << sArgs.weights_arg << endl;
00087             return 1; }
00088         for( i = 0; i < vecstrNames.size( ); ++i ) {
00089             vecvecdGenes[i].resize( vecveciGenes[i].size( ) );
00090             if( ( iOne = PCLWeights.GetExperiment( vecstrNames[i] ) ) == -1 ) {
00091                 cerr << "Could not find gene set weight: " << vecstrNames[i] << endl;
00092                 fill( vecvecdGenes[i].begin( ), vecvecdGenes[i].end( ), 0.0f );
00093                 continue; }
00094             for( j = 0; j < vecpGenes[i]->GetGenes( ); ++j )
00095                 vecvecdGenes[i][j] = ( ( iTwo = PCLWeights.GetGene( vecpGenes[i]->GetGene( j ).GetName( ) ) ) == -1 ) ?
00096                     0 : PCLWeights.Get( iTwo, iOne ); } }
00097 
00098     {
00099         CDat    DatOut;
00100 
00101         DatOut.Open( vecstrNames );
00102         for( iF1 = 0; iF1 < vecveciGenes.size( ); ++iF1 )
00103             for( iF2 = ( iF1 + 1 ); iF2 < vecveciGenes.size( ); ++iF2 ) {
00104                 map<const CGene*, size_t>           mappiGenes;
00105                 map<const CGene*, size_t>::iterator iterGene;
00106 
00107                 for( i = 0; i < vecpGenes[ iF1 ]->GetGenes( ); ++i )
00108                     mappiGenes[ &vecpGenes[ iF1 ]->GetGene( i ) ] = 1;
00109                 for( i = 0; i < vecpGenes[ iF2 ]->GetGenes( ); ++i ) {
00110                     const CGene*    pGene   = &vecpGenes[ iF2 ]->GetGene( i );
00111 
00112                     if( ( iterGene = mappiGenes.find( pGene ) ) == mappiGenes.end( ) )
00113                         mappiGenes[ pGene ] = 1;
00114                     else
00115                         iterGene->second++; }
00116 
00117                 dCountIn = dAveIn = 0;
00118                 for( i = 0; i < vecveciGenes[ iF1 ].size( ); ++i ) {
00119                     if( ( iOne = vecveciGenes[ iF1 ][ i ] ) == -1 )
00120                         continue;
00121                     dOne = vecvecdGenes.empty( ) ? 1 : vecvecdGenes[iF1][i];
00122                     iSharedOne = mappiGenes[ &vecpGenes[ iF1 ]->GetGene( i ) ];
00123                     if( ( eShared == ESharedDiscard ) && ( iSharedOne > 1 ) )
00124                         continue;
00125                     for( j = 0; j < vecveciGenes[ iF2 ].size( ); ++j ) {
00126                         iSharedTwo = mappiGenes[ &vecpGenes[ iF2 ]->GetGene( j ) ];
00127                         switch( eShared ) {
00128                             case ESharedDiscard:
00129                                 if( iSharedTwo > 1 )
00130                                     continue;
00131                                 break;
00132 
00133                             case ESharedOneOnly:
00134                                 if( ( iSharedOne > 1 ) && ( iSharedTwo > 1 ) )
00135                                     continue;
00136                                 break; }
00137 
00138                         if( ( ( iTwo = vecveciGenes[ iF2 ][ j ] ) != -1 ) &&
00139                             !CMeta::IsNaN( d = DatIn.Get( iOne, iTwo ) ) ) {
00140                             dTwo = vecvecdGenes.empty( ) ? 1 : vecvecdGenes[iF2][j];
00141                             dCountIn += dOne * dTwo;
00142                             dAveIn += d * dOne * dTwo; } } }
00143                 DatOut.Set( iF1, iF2, ( dCountIn >= sArgs.minimum_arg ) ? (float)( dAveIn / dCountIn ) : CMeta::GetNaN( ) ); }
00144         if( sArgs.zscore_flag )
00145             DatOut.Normalize( CDat::ENormalizeZScore );
00146         DatOut.Save( sArgs.output_arg );
00147     }
00148 
00149     if( sArgs.colors_arg ) {
00150         ofsm.open( sArgs.colors_arg );
00151         if( !ofsm.is_open( ) ) {
00152             cerr << "Could not open: " << sArgs.colors_arg << endl;
00153             return 1; }
00154         for( i = 0; i < vecpGenes.size( ); ++i ) {
00155             dAveIn = dAveOut = 0;
00156             for( iCountIn = iCountOut = iF1 = 0; iF1 < vecpGenes[ i ]->GetGenes( ); ++iF1 ) {
00157                 if( ( iOne = vecveciGenes[ i ][ iF1 ] ) == -1 )
00158                     continue;
00159                 for( iF2 = ( iF1 + 1 ); iF2 < vecpGenes[ i ]->GetGenes( ); ++iF2 )
00160                     if( ( ( iTwo = vecveciGenes[ i ][ iF2 ] ) != -1 ) &&
00161                         !CMeta::IsNaN( d = DatIn.Get( iOne, iTwo ) ) ) {
00162 //cerr << DatIn.GetGene( iOne ) << '\t' << DatIn.GetGene( iTwo ) << '\t' << d << endl;
00163                         iCountIn++;
00164                         dAveIn += d; }
00165                 for( j = 0; j < DatIn.GetGenes( ); ++j )
00166                     if( !vecpGenes[ i ]->IsGene( DatIn.GetGene( j ) ) &&
00167                         !CMeta::IsNaN( d = DatIn.Get( iOne, j ) ) ) {
00168                         iCountOut++;
00169                         dAveOut += d; } }
00170             dAveIn /= iCountIn;
00171             dAveOut /= iCountOut;
00172             ofsm << ( dAveIn - dAveOut ) << '\t' << dAveIn << '\t' << dAveOut << '\t' <<
00173                 vecstrNames[ i ] << endl; }
00174         ofsm.close( ); }
00175 
00176     for( i = 0; i < vecpGenes.size( ); ++i )
00177         delete vecpGenes[ i ];
00178 
00179     return 0; }