Sleipnir
tools/Seqs2Ngrams/Seqs2Ngrams.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 static const size_t c_iBuffer   = 1048576;
00026 static const char   c_acBases[] = {'T', 'A', 'C', 'G', 'N'};
00027 
00028 static size_t ngram( const string&, size_t, size_t );
00029 
00030 static inline size_t ngram( char c ) {
00031     size_t  i;
00032 
00033     for( i = 0; i < ARRAYSIZE(c_acBases); ++i )
00034         if( c == c_acBases[ i ] )
00035             return i;
00036 
00037     cerr << c << endl;
00038     exit( 1 ); }
00039 
00040 static inline size_t ipow( size_t iBase, size_t iExp ) {
00041     size_t  i, iRet;
00042 
00043     for( iRet = 1,i = 0; i < iExp; ++i )
00044         iRet *= iBase;
00045 
00046     return iRet; }
00047 
00048 int main( int iArgs, char** aszArgs ) {
00049     gengetopt_args_info     sArgs;
00050     char*                   acLine;
00051     ifstream                ifsm;
00052     CDat                    Dat;
00053     size_t                  i, j, k, m, iGene, iNgrams, iNgram, iPlaces;
00054     vector<vector<string> > vecvecstrSeqs;
00055     vector<size_t>          veciCounts;
00056     char*                   szSeq;
00057     float*                  adOne;
00058     float*                  adTwo;
00059     CGenome                 Genome;
00060     CGenes                  GenesIn( Genome );
00061     CMeasureEuclidean       Euclidean;
00062 
00063     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00064         cmdline_parser_print_help( );
00065         return 1; }
00066     CMeta Meta( sArgs.verbosity_arg );
00067 
00068     iPlaces = ipow( ARRAYSIZE(c_acBases), sArgs.n_arg - 1 );
00069     iNgrams = iPlaces * ARRAYSIZE(c_acBases);
00070 
00071     CMeasureSigmoid         EuclideanSig( &Euclidean, false, 1.0f / iNgrams );
00072 
00073     if( sArgs.genes_arg ) {
00074         ifsm.open( sArgs.genes_arg );
00075         if( !GenesIn.Open( ifsm ) ) {
00076             cerr << "Could not open: " << sArgs.genes_arg << endl;
00077             return 1; }
00078         ifsm.close( ); }
00079 
00080     if( sArgs.input_arg ) {
00081         ifsm.clear( );
00082         ifsm.open( sArgs.input_arg ); }
00083     {
00084         istream&                            istm    = sArgs.input_arg ? (istream&)ifsm : cin;
00085         map<string,size_t>                  mapGenes;
00086         map<string,size_t>::const_iterator  iterGene;
00087         vector<string>                      vecstrGenes;
00088 
00089         acLine = new char[ c_iBuffer ];
00090         while( !istm.eof( ) ) {
00091             istm.getline( acLine, c_iBuffer - 1 );
00092             if( !( szSeq = strchr( acLine, '\t' ) ) )
00093                 continue;
00094             *szSeq = 0;
00095 
00096             if( GenesIn.GetGenes( ) && !GenesIn.IsGene( acLine ) )
00097                 continue;
00098 
00099             if( ( iterGene = mapGenes.find( acLine ) ) == mapGenes.end( ) ) {
00100                 iGene = mapGenes.size( );
00101                 mapGenes[ acLine ] = iGene;
00102                 vecstrGenes.push_back( acLine );
00103                 veciCounts.push_back( 1 );
00104                 vecvecstrSeqs.push_back( vector<string>( ) ); }
00105             else
00106                 veciCounts[ iGene = iterGene->second ]++;
00107             vecvecstrSeqs[ iGene ].push_back( ++szSeq ); }
00108         delete[] acLine;
00109         Dat.Open( vecstrGenes );
00110     }
00111     if( sArgs.input_arg )
00112         ifsm.close( );
00113 
00114     adOne = new float[ iNgrams ];
00115     adTwo = new float[ iNgrams ];
00116     for( i = 0; i < Dat.GetGenes( ); ++i ) {
00117         cerr << "Gene " << i << '/' << Dat.GetGenes( ) << endl;
00118 
00119         memset( adOne, 0, iNgrams * sizeof(*adOne) );
00120         for( j = 0; j < vecvecstrSeqs[ i ].size( ); ++j ) {
00121             const string&   strSeq  = vecvecstrSeqs[ i ][ j ];
00122 
00123             adOne[ iNgram = ngram( strSeq, 0, sArgs.n_arg ) ]++;
00124             for( k = 1; k < ( strSeq.length( ) - sArgs.n_arg ); ++k ) {
00125                 iNgram = ( ( iNgram - ( ngram( strSeq[ k - 1 ] ) * iPlaces ) ) * ARRAYSIZE(c_acBases) ) +
00126                     ngram( strSeq[ k + sArgs.n_arg - 1 ] );
00127                 adOne[ iNgram ]++; } }
00128         for( j = 0; j < iNgrams; ++j )
00129             if( adOne[ j ] )
00130                 adOne[ j ] /= veciCounts[ i ];
00131 
00132         for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) {
00133             memset( adTwo, 0, iNgrams * sizeof(*adTwo) );
00134             for( k = 0; k < vecvecstrSeqs[ j ].size( ); ++k ) {
00135                 const string&   strSeq  = vecvecstrSeqs[ j ][ k ];
00136 
00137                 adTwo[ iNgram = ngram( strSeq, 0, sArgs.n_arg ) ]++;
00138                 for( m = 1; m < ( strSeq.length( ) - sArgs.n_arg ); ++m ) {
00139                     iNgram = ( ( iNgram - ( ngram( strSeq[ m - 1 ] ) * iPlaces ) ) * ARRAYSIZE(c_acBases) ) +
00140                         ngram( strSeq[ m + sArgs.n_arg - 1 ] );
00141                     adTwo[ iNgram ]++; } }
00142             for( k = 0; k < iNgrams; ++k )
00143                 if( adTwo[ k ] )
00144                     adTwo[ k ] /= veciCounts[ j ];
00145 
00146             Dat.Set( i, j, (float)EuclideanSig.Measure( adOne, iNgrams, adTwo, iNgrams, IMeasure::EMapNone, NULL, NULL ) );
00147         } }
00148     delete[] adOne;
00149     delete[] adTwo;
00150 
00151     if( sArgs.normalize_flag || sArgs.zscore_flag )
00152         Dat.Normalize( sArgs.zscore_flag ? CDat::ENormalizeZScore : CDat::ENormalizeMinMax );
00153 
00154     Dat.Save( sArgs.output_arg );
00155 
00156     return 0; }
00157 
00158 size_t ngram( const string& strSeq, size_t iOff, size_t iLen ) {
00159     size_t  i, iRet;
00160 
00161     for( iRet = i = 0; i < iLen; ++i )
00162         iRet = ( iRet * ARRAYSIZE(c_acBases) ) + ngram( strSeq[ iOff + i ] );
00163 
00164     return iRet; }