Sleipnir
tools/Matcher/Matcher.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 static const char   c_acDAB[]   = ".dab";
00026 
00027 bool read_values( const CDat& Dat, size_t iMin, size_t iMax, vector<float>& vecdValues ) {
00028     size_t  i, j, iCount;
00029     float   d;
00030 
00031     for( iCount = i = 0; i < Dat.GetGenes( ); ++i )
00032         for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) {
00033             if( CMeta::IsNaN( d = Dat.Get( i, j ) ) )
00034                 continue;
00035             iCount++;
00036             if( vecdValues.size( ) < iMax )
00037                 vecdValues.push_back( d );
00038             else if( ( (float)rand( ) / RAND_MAX ) < ( (float)iMax / iCount ) )
00039                 vecdValues[ rand( ) % vecdValues.size( ) ] = d; }
00040     if( !( i = vecdValues.size( ) ) )
00041         return false;
00042     while( vecdValues.size( ) < iMin )
00043         vecdValues.push_back( vecdValues[ rand( ) % i ] );
00044     sort( vecdValues.begin( ), vecdValues.end( ) );
00045 
00046     return true; }
00047 
00048 int main( int iArgs, char** aszArgs ) {
00049     gengetopt_args_info         sArgs;
00050     IMeasure*                   pMeasure;
00051     CMeasurePearson             Pearson;
00052     CMeasureEuclidean           Euclidean;
00053     CMeasureKendallsTau         KendallsTau;
00054     CMeasureKolmogorovSmirnov   KolmSmir;
00055     CMeasureHypergeometric      Hypergeom;
00056     CMeasureQuickPearson        PearQuick;
00057     CMeasureInnerProduct        InnerProd;
00058     CMeasureBinaryInnerProduct  BinInnerProd;
00059     size_t                      i, iTwo;
00060     string                      strFile;
00061 
00062     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00063         cmdline_parser_print_help( );
00064         return 1; }
00065     CMeta Meta( sArgs.verbosity_arg, sArgs.random_arg );
00066 
00067     CMeasureSigmoid             EuclideanSig( &Euclidean, false, 1.0f / sArgs.inputs_num );
00068     IMeasure*                   apMeasures[]    = { &Pearson, &EuclideanSig, &KendallsTau,
00069         &KolmSmir, &Hypergeom, &PearQuick, &InnerProd, &BinInnerProd, NULL };
00070 
00071     pMeasure = NULL;
00072     if( sArgs.distance_arg )
00073         for( i = 0; apMeasures[ i ]; ++i )
00074             if( !strcmp( apMeasures[ i ]->GetName( ), sArgs.distance_arg ) ) {
00075                 pMeasure = apMeasures[ i ];
00076                 break; }
00077     if( !pMeasure ) {
00078         cerr << "Could not recognize: " << sArgs.distance_arg << endl;
00079         return 1; }
00080 
00081     if( sArgs.table_flag ) {
00082         for( i = 0; i < sArgs.inputs_num; ++i )
00083             cout << '\t' << sArgs.inputs[ i ];
00084         cout << endl; }
00085     if( !sArgs.inputs_num )
00086         return 0;
00087     FOR_EACH_DIRECTORY_FILE((string)sArgs.input_arg, strFile)
00088         CDat            DatOne;
00089         vector<float>   vecdOne;
00090 
00091         if( !CMeta::IsExtension( strFile, c_acDAB ) )
00092             continue;
00093 
00094         strFile = (string)sArgs.input_arg + '/' + strFile;
00095         if( !DatOne.Open( strFile.c_str( ), !!sArgs.memmap_flag ) ) {
00096             cerr << "Could not open: " << strFile << endl;
00097             return 1; }
00098         if( !read_values( DatOne, sArgs.size_min_arg, sArgs.size_max_arg, vecdOne ) )
00099             continue;
00100 
00101         for( iTwo = 0; iTwo < sArgs.inputs_num; ++iTwo ) {
00102             CDat            DatTwo;
00103             vector<float>   vecdTwo;
00104 
00105             if( !DatTwo.Open( sArgs.inputs[ iTwo ], !!sArgs.memmap_flag ) ) {
00106                 cerr << "Could not open: " << sArgs.inputs[ iTwo ] << endl;
00107                 return 1; }
00108             if( sArgs.table_flag )
00109                 cout << '\t';
00110             if( !read_values( DatTwo, sArgs.size_min_arg, sArgs.size_max_arg, vecdTwo ) )
00111                 continue;
00112             if( !sArgs.table_flag )
00113                 cout << strFile << '\t' << sArgs.inputs[ iTwo ] << '\t';
00114             cout << pMeasure->Measure( &vecdOne.front( ), vecdOne.size( ), &vecdTwo.front( ), vecdTwo.size( ),
00115                 IMeasure::EMapNone );
00116             if( !sArgs.table_flag )
00117                 cout << endl; }
00118         if( sArgs.table_flag )
00119             cout << endl; }
00120 
00121     return 0; }