Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 static const size_t c_iBuffer = 1048576; 00026 static const char c_acBases[] = {'T', 'A', 'C', 'G', 'N'}; 00027 00028 static size_t ngram( const string&, size_t, size_t ); 00029 00030 static inline size_t ngram( char c ) { 00031 size_t i; 00032 00033 for( i = 0; i < ARRAYSIZE(c_acBases); ++i ) 00034 if( c == c_acBases[ i ] ) 00035 return i; 00036 00037 cerr << c << endl; 00038 exit( 1 ); } 00039 00040 static inline size_t ipow( size_t iBase, size_t iExp ) { 00041 size_t i, iRet; 00042 00043 for( iRet = 1,i = 0; i < iExp; ++i ) 00044 iRet *= iBase; 00045 00046 return iRet; } 00047 00048 int main( int iArgs, char** aszArgs ) { 00049 gengetopt_args_info sArgs; 00050 char* acLine; 00051 ifstream ifsm; 00052 CDat Dat; 00053 size_t i, j, k, m, iGene, iNgrams, iNgram, iPlaces; 00054 vector<vector<string> > vecvecstrSeqs; 00055 vector<size_t> veciCounts; 00056 char* szSeq; 00057 float* adOne; 00058 float* adTwo; 00059 CGenome Genome; 00060 CGenes GenesIn( Genome ); 00061 CMeasureEuclidean Euclidean; 00062 00063 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00064 cmdline_parser_print_help( ); 00065 return 1; } 00066 CMeta Meta( sArgs.verbosity_arg ); 00067 00068 iPlaces = ipow( ARRAYSIZE(c_acBases), sArgs.n_arg - 1 ); 00069 iNgrams = iPlaces * ARRAYSIZE(c_acBases); 00070 00071 CMeasureSigmoid EuclideanSig( &Euclidean, false, 1.0f / iNgrams ); 00072 00073 if( sArgs.genes_arg ) { 00074 ifsm.open( sArgs.genes_arg ); 00075 if( !GenesIn.Open( ifsm ) ) { 00076 cerr << "Could not open: " << sArgs.genes_arg << endl; 00077 return 1; } 00078 ifsm.close( ); } 00079 00080 if( sArgs.input_arg ) { 00081 ifsm.clear( ); 00082 ifsm.open( sArgs.input_arg ); } 00083 { 00084 istream& istm = sArgs.input_arg ? (istream&)ifsm : cin; 00085 map<string,size_t> mapGenes; 00086 map<string,size_t>::const_iterator iterGene; 00087 vector<string> vecstrGenes; 00088 00089 acLine = new char[ c_iBuffer ]; 00090 while( !istm.eof( ) ) { 00091 istm.getline( acLine, c_iBuffer - 1 ); 00092 if( !( szSeq = strchr( acLine, '\t' ) ) ) 00093 continue; 00094 *szSeq = 0; 00095 00096 if( GenesIn.GetGenes( ) && !GenesIn.IsGene( acLine ) ) 00097 continue; 00098 00099 if( ( iterGene = mapGenes.find( acLine ) ) == mapGenes.end( ) ) { 00100 iGene = mapGenes.size( ); 00101 mapGenes[ acLine ] = iGene; 00102 vecstrGenes.push_back( acLine ); 00103 veciCounts.push_back( 1 ); 00104 vecvecstrSeqs.push_back( vector<string>( ) ); } 00105 else 00106 veciCounts[ iGene = iterGene->second ]++; 00107 vecvecstrSeqs[ iGene ].push_back( ++szSeq ); } 00108 delete[] acLine; 00109 Dat.Open( vecstrGenes ); 00110 } 00111 if( sArgs.input_arg ) 00112 ifsm.close( ); 00113 00114 adOne = new float[ iNgrams ]; 00115 adTwo = new float[ iNgrams ]; 00116 for( i = 0; i < Dat.GetGenes( ); ++i ) { 00117 cerr << "Gene " << i << '/' << Dat.GetGenes( ) << endl; 00118 00119 memset( adOne, 0, iNgrams * sizeof(*adOne) ); 00120 for( j = 0; j < vecvecstrSeqs[ i ].size( ); ++j ) { 00121 const string& strSeq = vecvecstrSeqs[ i ][ j ]; 00122 00123 adOne[ iNgram = ngram( strSeq, 0, sArgs.n_arg ) ]++; 00124 for( k = 1; k < ( strSeq.length( ) - sArgs.n_arg ); ++k ) { 00125 iNgram = ( ( iNgram - ( ngram( strSeq[ k - 1 ] ) * iPlaces ) ) * ARRAYSIZE(c_acBases) ) + 00126 ngram( strSeq[ k + sArgs.n_arg - 1 ] ); 00127 adOne[ iNgram ]++; } } 00128 for( j = 0; j < iNgrams; ++j ) 00129 if( adOne[ j ] ) 00130 adOne[ j ] /= veciCounts[ i ]; 00131 00132 for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) { 00133 memset( adTwo, 0, iNgrams * sizeof(*adTwo) ); 00134 for( k = 0; k < vecvecstrSeqs[ j ].size( ); ++k ) { 00135 const string& strSeq = vecvecstrSeqs[ j ][ k ]; 00136 00137 adTwo[ iNgram = ngram( strSeq, 0, sArgs.n_arg ) ]++; 00138 for( m = 1; m < ( strSeq.length( ) - sArgs.n_arg ); ++m ) { 00139 iNgram = ( ( iNgram - ( ngram( strSeq[ m - 1 ] ) * iPlaces ) ) * ARRAYSIZE(c_acBases) ) + 00140 ngram( strSeq[ m + sArgs.n_arg - 1 ] ); 00141 adTwo[ iNgram ]++; } } 00142 for( k = 0; k < iNgrams; ++k ) 00143 if( adTwo[ k ] ) 00144 adTwo[ k ] /= veciCounts[ j ]; 00145 00146 Dat.Set( i, j, (float)EuclideanSig.Measure( adOne, iNgrams, adTwo, iNgrams, IMeasure::EMapNone, NULL, NULL ) ); 00147 } } 00148 delete[] adOne; 00149 delete[] adTwo; 00150 00151 if( sArgs.normalize_flag || sArgs.zscore_flag ) 00152 Dat.Normalize( sArgs.zscore_flag ? CDat::ENormalizeZScore : CDat::ENormalizeMinMax ); 00153 00154 Dat.Save( sArgs.output_arg ); 00155 00156 return 0; } 00157 00158 size_t ngram( const string& strSeq, size_t iOff, size_t iLen ) { 00159 size_t i, iRet; 00160 00161 for( iRet = i = 0; i < iLen; ++i ) 00162 iRet = ( iRet * ARRAYSIZE(c_acBases) ) + ngram( strSeq[ iOff + i ] ); 00163 00164 return iRet; }