Sleipnir
tools/Data2Features/Data2Features.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
// File-local constants.
// The three string constants are not referenced in the visible part of this
// file; they are kept so that any other user of them keeps compiling.
static const char c_szERROR[] = "ERROR";
static const char c_szAnnotated[] = "annotated";
static const char c_szValue[] = "value";

// First character of a line to skip in the environment (feature) file.
static const char c_cComment = '#';

// Separator used when stripping extensions from input file names.
static const char c_cDot = '.';

// Size, in bytes, of the line buffer used while reading text files.
static const size_t c_iBuf = 1024;
00031 
/**
 * A named feature together with the ordered list of labels it may take.
 *
 * m_iDefault holds the index (into m_vecstrValues) of the label to use when a
 * dataset does not specify this feature; it is (size_t)-1 until a default is
 * assigned.
 */
struct SFeature {
    string          m_strName;
    vector<string>  m_vecstrValues;
    size_t          m_iDefault;

    SFeature( const string& strName ) : m_strName( strName ), m_iDefault( static_cast<size_t>( -1 ) ) { }

    /**
     * Map a label to its position in m_vecstrValues.
     *
     * Returns the zero-based index of the first label equal to strValue, or
     * (size_t)-1 when no label matches.
     */
    size_t quantize( const string& strValue ) const {
        vector<string>::const_iterator  iterValue;

        for( iterValue = m_vecstrValues.begin( ); iterValue != m_vecstrValues.end( ); ++iterValue )
            if( *iterValue == strValue )
                return static_cast<size_t>( iterValue - m_vecstrValues.begin( ) );

        return static_cast<size_t>( -1 ); }
};
00048 
/**
 * Feature assignments recorded for one named dataset.
 *
 * m_mapiiFeatures maps a feature's index (its position in the feature list
 * built by main) to the value index stored for that feature.
 */
struct SDatum {
    // Dataset name (first column of a data-file line).
    string              m_strName;
    // Feature index -> value index.
    map<size_t,size_t>  m_mapiiFeatures;
};
00053 
00054 int main( int iArgs, char** aszArgs ) {
00055     gengetopt_args_info sArgs;
00056     size_t              i, j, k;
00057     vector<SFeature>    vecsFeatures;
00058     vector<string>      vecstrLine, vecstrToken;
00059     ifstream            ifsm;
00060     char                szBuf[ c_iBuf ];
00061     map<string,SDatum>  mapValues;
00062     CGenome             Genome;
00063     CGenes              GenesPos( Genome );
00064 //  vector<float>       vecdQuants;
00065 
00066     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00067         cmdline_parser_print_help( );
00068         return 1; }
00069     CMeta Meta( sArgs.verbosity_arg );
00070 
00071     ifsm.open( sArgs.environment_arg );
00072     if( !ifsm.is_open( ) ) {
00073         cerr << "Could not open: " << sArgs.environment_arg << endl;
00074         return 1; }
00075     while( ifsm.peek( ) != EOF ) {
00076         ifsm.getline( szBuf, c_iBuf - 1 );
00077         vecstrLine.clear( );
00078         CMeta::Tokenize( szBuf, vecstrLine );
00079         if( ( vecstrLine.size( ) != 0 ) && ( vecstrLine[ 0 ][ 0 ] != c_cComment ) ) {
00080             vecsFeatures.push_back( SFeature( vecstrLine[ 0 ] ) );
00081             {
00082                 SFeature&   sFeature    = vecsFeatures[ vecsFeatures.size( ) - 1 ];
00083 
00084                 vecstrToken.clear( );
00085                 CMeta::Tokenize( vecstrLine[ 1 ].c_str( ), vecstrToken, "|" );
00086                 sFeature.m_vecstrValues.resize( vecstrToken.size( ) );
00087                 copy( vecstrToken.begin( ), vecstrToken.end( ), sFeature.m_vecstrValues.begin( ) );
00088 
00089                 if( vecstrLine.size( ) > 2 )
00090                     sFeature.m_iDefault = sFeature.quantize( vecstrLine[ 2 ] );
00091             } } }
00092     ifsm.close( );
00093 
00094     ifsm.clear( );
00095     ifsm.open( sArgs.data_arg );
00096     if( !ifsm.is_open( ) ) {
00097         cerr << "Could not open: " << sArgs.data_arg << endl;
00098         return 1; }
00099     while( ifsm.peek( ) != EOF ) {
00100         ifsm.getline( szBuf, c_iBuf - 1 );
00101         vecstrLine.clear( );
00102         CMeta::Tokenize( szBuf, vecstrLine );
00103         if( vecstrLine.size( ) == 0 )
00104             continue;
00105         {
00106             SDatum& sCur    = mapValues[ vecstrLine[ 0 ] ];
00107             char*   pc;
00108 
00109             sCur.m_strName = vecstrLine[ 0 ];
00110             for( i = 1; i < vecstrLine.size( ); ++i ) {
00111                 vecstrToken.clear( );
00112                 CMeta::Tokenize( vecstrLine[ i ].c_str( ), vecstrToken, "|" );
00113                 if( vecstrToken.size( ) != 2 ) {
00114                     cerr << "Illegal token in " << sArgs.data_arg << ": " << szBuf << endl;
00115                     return 1; }
00116                 for( j = 0; j < vecsFeatures.size( ); ++j )
00117                     if( vecstrToken[ 0 ] == vecsFeatures[ j ].m_strName )
00118                         break;
00119                 if( j >= vecsFeatures.size( ) ) {
00120                     cerr << "Unknown feature: " << vecstrLine[ i ] << endl;
00121                     return 1; }
00122                 k = strtol( vecstrToken[ 1 ].c_str( ), &pc, 10 );
00123                 sCur.m_mapiiFeatures[ j ] = ( pc != vecstrToken[ 1 ].c_str( )) ? k :
00124                     vecsFeatures[ j ].quantize( vecstrToken[ 1 ] ); }
00125         } }
00126     ifsm.close( );
00127 
00128 /*
00129     ifsm.clear( );
00130     ifsm.open( sArgs.quants_arg );
00131     if( !ifsm.is_open( ) ) {
00132         cerr << "Could not open: " << sArgs.quants_arg << endl;
00133         return 1; }
00134     ifsm.getline( szBuf, c_iBuf - 1 );
00135     vecstrLine.clear( );
00136     CMeta::Tokenize( szBuf, vecstrLine );
00137     vecdQuants.resize( vecstrLine.size( ) );
00138     for( i = 0; i < vecdQuants.size( ); ++i )
00139         vecdQuants[ i ] = (float)atof( vecstrLine[ i ].c_str( ) );
00140     ifsm.close( );
00141 */
00142 
00143     if( sArgs.genome_arg ) {
00144         ifsm.clear( );
00145         ifsm.open( sArgs.genome_arg );
00146         if( !Genome.Open( ifsm ) ) {
00147             cerr << "Could not open: " << sArgs.genome_arg << endl;
00148             return 1; }
00149         ifsm.close( ); }
00150 
00151     if( sArgs.positives_arg ) {
00152         ifsm.clear( );
00153         ifsm.open( sArgs.positives_arg ); }
00154     if( !GenesPos.Open( sArgs.positives_arg ? ifsm : cin ) ) {
00155         cerr << "Could not open: " << ( sArgs.positives_arg ? sArgs.positives_arg : "input genes" ) << endl;
00156         return 1; }
00157     if( sArgs.positives_arg )
00158         ifsm.close( );
00159     ifsm.clear( );
00160 
00161     cout << "<?xml version='1.0' encoding='utf-8'?>" << endl;
00162     cout << "<dataset name='datasets'>" << endl;
00163     cout << "  <header>" << endl;
00164     cout << "    <attributes>" << endl;
00165 /*
00166     cout << "      <attribute name='annotated' type='nominal'>" << endl;
00167     cout << "        <labels>" << endl;
00168     cout << "          <label>1</label>" << endl;
00169     cout << "          <label>2</label>" << endl;
00170     cout << "        </labels>" << endl;
00171     cout << "      </attribute>" << endl;
00172 */
00173     cout << "      <attribute name='value' type='numeric'/>" << endl;
00174     for( i = 0; i < vecsFeatures.size( ); ++i ) {
00175         cout << "      <attribute name='" << vecsFeatures[ i ].m_strName << "' type='nominal'>" << endl;
00176         cout << "        <labels>" << endl;
00177         for( j = 0; j < vecsFeatures[ i ].m_vecstrValues.size( ); ++j )
00178             cout << "          <label>" << ( j + 1 ) << "</label>" << endl;
00179         cout << "        </labels>" << endl;
00180         cout << "      </attribute>" << endl; }
00181 /*
00182     for( i = 0; i < Genome.GetGenes( ); ++i ) {
00183         cout << "      <attribute name='" << Genome.GetGene( i ).GetName( ) << "' type='nominal'>" << endl;
00184         cout << "        <labels>" << endl;
00185         cout << "          <label>1</label>" << endl;
00186         cout << "          <label>2</label>" << endl;
00187         cout << "        </labels>" << endl;
00188         cout << "      </attribute>" << endl; }
00189 */
00190     cout << "    </attributes>" << endl;
00191     cout << "  </header>" << endl;
00192 
00193     cout << "  <body>" << endl;
00194     cout << "    <instances>" << endl;
00195     for( i = 0; i < sArgs.inputs_num; ++i ) {
00196         CPCL                PCL;
00197         CDat                Dat;
00198         size_t              iCount;
00199         vector<size_t>      veciGenes;
00200         float               d, dAverage;
00201         map<size_t,size_t>  mapiiCur;
00202         int                 iRet;
00203         string              strName;
00204 
00205         cerr << "Processing: " << sArgs.inputs[ i ] << endl;
00206 
00207         strName = CMeta::Basename( sArgs.inputs[ i ] );
00208         while( ( j = strName.rfind( c_cDot ) ) != string::npos )
00209             strName = strName.substr( 0, j );
00210         const SDatum&   sDatum  = mapValues[ strName ];
00211 
00212         if( !Dat.Open( sArgs.inputs[ i ], !!sArgs.memmap_flag ) && ( iRet = CPCL::Distance( sArgs.inputs[ i ],
00213             sArgs.skip_arg, sArgs.distance_arg, !!sArgs.normalize_flag, !!sArgs.zscore_flag, false,
00214             sArgs.genome_arg, CMeta::GetNaN( ), -1, PCL, Dat ) ) ) {
00215             cerr << "Could not open: " << sArgs.inputs[ i ] << endl;
00216             cmdline_parser_print_help( );
00217             return iRet; }
00218 
00219         veciGenes.resize( GenesPos.GetGenes( ) );
00220         for( j = 0; j < veciGenes.size( ); ++j )
00221             veciGenes[ j ] = Dat.GetGene( GenesPos.GetGene( j ).GetName( ) );
00222         dAverage = 0;
00223         for( iCount = j = 0; j < veciGenes.size( ); ++j ) {
00224             if( veciGenes[ j ] == -1 )
00225                 continue;
00226             for( k = ( j + 1 ); k < veciGenes.size( ); ++k )
00227                 if( ( veciGenes[ k ] != -1 ) &&
00228                     !CMeta::IsNaN( d = Dat.Get( veciGenes[ j ], veciGenes[ k ] ) ) ) {
00229                     iCount++;
00230                     dAverage += d; } }
00231         if( iCount )
00232             dAverage /= iCount;
00233 /*
00234         adCentroid = new float[ PCL.GetExperiments( ) ];
00235         for( iCount = i = 0; i < GenesPos.GetGenes( ); ++i ) {
00236             if( ( j = PCL.GetGene( GenesPos.GetGene( i ).GetName( ) ) ) == -1 )
00237                 continue;
00238             iCount++;
00239             for( k = 0; k < PCL.GetExperiments( ); ++k )
00240                 adCentroid[ k ] += PCL.Get( j, k ); }
00241         for( i = 0; i < PCL.GetExperiments( ); ++i )
00242             adCentroid[ i ] /= iCount;
00243 */
00244 
00245         cout << "      <instance>" << endl;
00246         cout << "        <value>" << dAverage << "</value>" << endl;
00247         for( j = 0; j < vecsFeatures.size( ); ++j ) {
00248             const SFeature&                     sFeature    = vecsFeatures[ j ];
00249             map<size_t,size_t>::const_iterator  iterCur;
00250             size_t                              iCur;
00251 
00252             if( ( iterCur = sDatum.m_mapiiFeatures.find( j ) ) != sDatum.m_mapiiFeatures.end( ) )
00253                 iCur = iterCur->second;
00254             else
00255                 iCur = sFeature.m_iDefault;
00256             cout << "        <value>" << ( iCur + 1 ) << "</value>" << endl; }
00257         cout << "      </instance>" << endl; }
00258     cout << "    </instances>" << endl;
00259     cout << "  </body>" << endl;
00260     cout << "</dataset>" << endl;
00261 
00262     return 0; }