Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 static const char c_szBick[] = "[Bick]"; 00026 static const char c_szBicd[] = "[Bicd]"; 00027 static const char c_szSamba[] = "samba"; 00028 static const char c_szFuzzy[] = "fuzzy"; 00029 static const char c_szList[] = "list"; 00030 static const char c_szParam[] = "param"; 00031 00032 int open_list( istream&, CGenome&, vector<float>&, vector<CGenes*>& ); 00033 int open_samba( istream&, CGenome&, vector<float>&, vector<CGenes*>& ); 00034 int open_param( istream&, CGenome&, vector<float>&, vector<CGenes*>& ); 00035 int open_fuzzy( istream&, size_t, CDat& ); 00036 00037 int main( int iArgs, char** aszArgs ) { 00038 CDat Dat; 00039 ifstream ifsm; 00040 istream* pistm; 00041 gengetopt_args_info sArgs; 00042 CGenome Genome; 00043 vector<float> vecdWeights; 00044 vector<CGenes*> vecpClusters; 00045 vector<string> vecstrGenes; 00046 set<string> setstrGenes; 00047 set<string>::const_iterator iterGene; 00048 vector<vector<size_t> > vecveciGenes; 00049 int iRet; 00050 size_t i, j, iCluster, iGeneOne, iGeneTwo, iOne, iTwo; 00051 CGenes* pClusterOne; 00052 float d, dOne; 00053 map<string, size_t> mapstriGenes; 00054 map<string, size_t>::const_iterator iterIndex; 00055 00056 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00057 cmdline_parser_print_help( ); 00058 return 1; } 00059 CMeta Meta( sArgs.verbosity_arg ); 00060 00061 if( sArgs.input_arg ) { 00062 ifsm.open( sArgs.input_arg ); 00063 pistm = &ifsm; } 00064 else 00065 pistm = &cin; 00066 iRet = 1; 00067 if( !strcmp( c_szSamba, sArgs.type_arg ) ) 00068 iRet = open_samba( *pistm, Genome, vecdWeights, vecpClusters ); 00069 else if( !strcmp( c_szList, sArgs.type_arg ) ) 00070 iRet = open_list( *pistm, Genome, vecdWeights, vecpClusters ); 00071 else if( !strcmp( c_szParam, sArgs.type_arg ) ) 00072 iRet = open_param( *pistm, Genome, vecdWeights, vecpClusters ); 00073 else if( !strcmp( c_szFuzzy, sArgs.type_arg ) ) 00074 iRet = open_fuzzy( *pistm, sArgs.skip_arg, Dat ); 00075 else 00076 cmdline_parser_print_help( ); 00077 if( iRet ) 00078 return iRet; 00079 00080 if( !vecpClusters.size( ) ) { 00081 cerr << "No clusters found!" << endl; 00082 return 1; } 00083 00084 for( i = 0; i < vecpClusters.size( ); ++i ) 00085 for( j = 0; j < vecpClusters[ i ]->GetGenes( ); ++j ) 00086 setstrGenes.insert( vecpClusters[ i ]->GetGene( j ).GetName( ) ); 00087 vecstrGenes.resize( setstrGenes.size( ) ); 00088 for( i = 0,iterGene = setstrGenes.begin( ); iterGene != setstrGenes.end( ); ++i,++iterGene ) 00089 vecstrGenes[ i ] = *iterGene; 00090 00091 Dat.Open( vecstrGenes ); 00092 for( i = 0; i < Dat.GetGenes( ); ++i ) 00093 mapstriGenes[ Dat.GetGene( i ) ] = i; 00094 vecveciGenes.resize( vecpClusters.size( ) ); 00095 for( i = 0; i < vecveciGenes.size( ); ++i ) { 00096 vecveciGenes[ i ].resize( vecpClusters[ i ]->GetGenes( ) ); 00097 for( j = 0; j < vecveciGenes[ i ].size( ); ++j ) 00098 vecveciGenes[ i ][ j ] = ( ( iterIndex = mapstriGenes.find( 00099 vecpClusters[ i ]->GetGene( j ).GetName( ) ) ) == mapstriGenes.end( ) ) ? -1 : 00100 iterIndex->second; } 00101 00102 if( sArgs.size_flag ) 00103 for( iCluster = 0; iCluster < vecpClusters.size( ); ++iCluster ) 00104 vecdWeights[ iCluster ] = 1.0f / vecpClusters[ iCluster ]->GetGenes( ); 00105 if( sArgs.counts_flag ) 00106 for( iCluster = 0; iCluster < vecpClusters.size( ); ++iCluster ) { 00107 const vector<size_t>& veciOne = vecveciGenes[ iCluster ]; 00108 00109 cerr << "Processing cluster " << iCluster << "/" << vecpClusters.size( ) << endl; 00110 pClusterOne = vecpClusters[ iCluster ]; 00111 dOne = vecdWeights[ iCluster ]; 00112 for( iGeneOne = 0; iGeneOne < veciOne.size( ); ++iGeneOne ) { 00113 iOne = veciOne[ iGeneOne ]; 00114 for( iGeneTwo = ( iGeneOne + 1 ); iGeneTwo < veciOne.size( ); ++iGeneTwo ) { 00115 float d; 00116 00117 iTwo = veciOne[ iGeneTwo ]; 00118 d = Dat.Get( iOne, iTwo ); 00119 Dat.Set( iOne, iTwo, ( CMeta::IsNaN( d ) ? 0 : d ) + dOne ); } } } 00120 else { 00121 vector<bool> vecfClustered; 00122 00123 vecfClustered.resize( Dat.GetGenes( ) ); 00124 for( iGeneOne = 0; iGeneOne < vecfClustered.size( ); ++iGeneOne ) { 00125 const string& strGene = Dat.GetGene( iGeneOne ); 00126 00127 for( iCluster = 0; iCluster < vecpClusters.size( ); ++iCluster ) 00128 if( vecpClusters[ iCluster ]->IsGene( strGene ) ) { 00129 vecfClustered[ iGeneOne ] = true; 00130 break; } } 00131 for( iGeneOne = 0; iGeneOne < Dat.GetGenes( ); ++iGeneOne ) { 00132 if( !vecfClustered[ iGeneOne ] ) 00133 continue; 00134 for( iGeneTwo = ( iGeneOne + 1 ); iGeneTwo < Dat.GetGenes( ); ++iGeneTwo ) 00135 if( vecfClustered[ iGeneTwo ] ) 00136 Dat.Set( iGeneOne, iGeneTwo, 0 ); } 00137 for( iCluster = 0; iCluster < vecpClusters.size( ); ++iCluster ) { 00138 const vector<size_t>& veciOne = vecveciGenes[ iCluster ]; 00139 00140 cerr << "Processing cluster " << iCluster << "/" << vecpClusters.size( ) << endl; 00141 pClusterOne = vecpClusters[ iCluster ]; 00142 dOne = vecdWeights[ iCluster ]; 00143 for( iGeneOne = 0; iGeneOne < veciOne.size( ); ++iGeneOne ) { 00144 iOne = veciOne[ iGeneOne ]; 00145 for( iGeneTwo = ( iGeneOne + 1 ); iGeneTwo < veciOne.size( ); ++iGeneTwo ) { 00146 iTwo = veciOne[ iGeneTwo ]; 00147 if( CMeta::IsNaN( d = Dat.Get( iOne, iTwo ) ) || ( dOne > d ) ) 00148 Dat.Set( iOne, iTwo, dOne ); } } } } 00149 00150 Dat.Save( sArgs.output_arg ); 00151 for( i = 0; i < vecpClusters.size( ); ++i ) 00152 delete vecpClusters[ i ]; 00153 00154 return 0; } 00155 00156 struct SSorter { 00157 const vector<float>& m_vecdWeights; 00158 00159 SSorter( const vector<float>& vecdWeights ) : m_vecdWeights(vecdWeights) { } 00160 00161 bool operator()( size_t iOne, size_t iTwo ) { 00162 00163 return ( m_vecdWeights[ iOne ] < m_vecdWeights[ iTwo ] ); } 00164 }; 00165 00166 int open_samba( istream& istm, CGenome& Genome, vector<float>& vecdWeights, vector<CGenes*>& vecpClusters ) { 00167 static const size_t c_iLine = 1024; 00168 char szLine[ c_iLine ]; 00169 vector<string> vecstrLine, vecstrGenes; 00170 CGenes* pCluster; 00171 size_t iCluster, iCur, i; 00172 vector<size_t> veciIndices; 00173 vector<float> vecdCopy; 00174 vector<CGenes*> vecpCopy; 00175 00176 while( !istm.eof( ) ) { 00177 istm.getline( szLine, c_iLine - 1 ); 00178 if( !strcmp( szLine, c_szBick ) ) 00179 continue; 00180 if( !strcmp( szLine, c_szBicd ) ) 00181 break; 00182 vecstrLine.clear( ); 00183 CMeta::Tokenize( szLine, vecstrLine ); 00184 if( vecstrLine.size( ) != 2 ) { 00185 cerr << "Illegal line: " << szLine; 00186 return 1; } 00187 vecdCopy.push_back( (float)atof( vecstrLine[ 1 ].c_str( ) ) ); } 00188 00189 iCluster = -1; 00190 while( !istm.eof( ) ) { 00191 istm.getline( szLine, c_iLine - 1 ); 00192 if( !szLine[ 0 ] ) 00193 break; 00194 vecstrLine.clear( ); 00195 CMeta::Tokenize( szLine, vecstrLine ); 00196 if( vecstrLine.size( ) != 3 ) { 00197 cerr << "Illegal line: " << szLine; 00198 return 1; } 00199 if( atoi( vecstrLine[ 1 ].c_str( ) ) != 1 ) 00200 continue; 00201 if( ( iCur = atoi( vecstrLine[ 0 ].c_str( ) ) ) != iCluster ) { 00202 if( vecstrGenes.size( ) ) { 00203 vecpCopy.push_back( pCluster = new CGenes( Genome ) ); 00204 pCluster->Open( vecstrGenes ); } 00205 vecstrGenes.clear( ); 00206 iCluster = iCur; } 00207 vecstrGenes.push_back( vecstrLine[ 2 ] ); } 00208 if( vecstrGenes.size( ) ) { 00209 vecpCopy.push_back( pCluster = new CGenes( Genome ) ); 00210 pCluster->Open( vecstrGenes ); } 00211 00212 veciIndices.resize( vecdCopy.size( ) ); 00213 for( i = 0; i < veciIndices.size( ); ++i ) 00214 veciIndices[ i ] = i; 00215 sort( veciIndices.begin( ), veciIndices.end( ), SSorter( vecdCopy ) ); 00216 for( i = 0; i < veciIndices.size( ); ++i ) { 00217 vecdWeights.push_back( vecdCopy[ veciIndices[ i ] ] ); 00218 vecpClusters.push_back( vecpCopy[ veciIndices[ i ] ] ); } 00219 00220 return 0; } 00221 00222 int open_list( istream& istm, CGenome& Genome, vector<float>& vecdWeights, vector<CGenes*>& vecpClusters ) { 00223 static const size_t c_iLine = 1024; 00224 char szLine[ c_iLine ]; 00225 vector<string> vecstrLine, vecstrGenes; 00226 CGenes* pCluster; 00227 size_t iCluster, iCur; 00228 00229 iCluster = -1; 00230 while( !istm.eof( ) ) { 00231 istm.getline( szLine, c_iLine - 1 ); 00232 if( !szLine[ 0 ] ) 00233 break; 00234 vecstrLine.clear( ); 00235 CMeta::Tokenize( szLine, vecstrLine ); 00236 if( vecstrLine.size( ) != 2 ) { 00237 cerr << "Illegal line: " << szLine; 00238 return 1; } 00239 if( ( iCur = atoi( vecstrLine[ 1 ].c_str( ) ) ) != iCluster ) { 00240 if( vecstrGenes.size( ) ) { 00241 vecdWeights.push_back( 1 ); 00242 vecpClusters.push_back( pCluster = new CGenes( Genome ) ); 00243 pCluster->Open( vecstrGenes ); } 00244 vecstrGenes.clear( ); 00245 iCluster = iCur; } 00246 vecstrGenes.push_back( vecstrLine[ 0 ] ); } 00247 if( vecstrGenes.size( ) ) { 00248 vecdWeights.push_back( 1 ); 00249 vecpClusters.push_back( pCluster = new CGenes( Genome ) ); 00250 pCluster->Open( vecstrGenes ); } 00251 00252 return 0; } 00253 00254 int open_param( istream& istm, CGenome& Genome, vector<float>& vecdWeights, vector<CGenes*>& vecpClusters ) { 00255 static const size_t c_iLine = 1024; 00256 char szLine[ c_iLine ]; 00257 vector<string> vecstrLine, vecstrGenes; 00258 CGenes* pCluster; 00259 size_t iCluster, iCur; 00260 float dParam; 00261 00262 while( !istm.eof( ) ) { 00263 istm.getline( szLine, c_iLine - 1 ); 00264 if( !szLine[ 0 ] ) 00265 break; 00266 vecstrLine.clear( ); 00267 CMeta::Tokenize( szLine, vecstrLine ); 00268 if( vecstrLine.size( ) != 2 ) { 00269 cerr << "Illegal line: " << szLine; 00270 return 1; } 00271 if( vecstrLine[ 0 ] == c_szParam ) { 00272 if( vecstrGenes.size( ) ) { 00273 vecdWeights.push_back( dParam ); 00274 vecpClusters.push_back( pCluster = new CGenes( Genome ) ); 00275 pCluster->Open( vecstrGenes ); } 00276 iCluster = -1; 00277 vecstrGenes.clear( ); 00278 dParam = (float)atof( vecstrLine[ 1 ].c_str( ) ); 00279 continue; } 00280 if( ( iCur = atoi( vecstrLine[ 1 ].c_str( ) ) ) != iCluster ) { 00281 if( vecstrGenes.size( ) ) { 00282 vecdWeights.push_back( dParam ); 00283 vecpClusters.push_back( pCluster = new CGenes( Genome ) ); 00284 pCluster->Open( vecstrGenes ); } 00285 iCluster = iCur; 00286 vecstrGenes.clear( ); } 00287 vecstrGenes.push_back( vecstrLine[ 0 ] ); } 00288 if( vecstrGenes.size( ) ) { 00289 vecdWeights.push_back( dParam ); 00290 vecpClusters.push_back( pCluster = new CGenes( Genome ) ); 00291 pCluster->Open( vecstrGenes ); } 00292 00293 return 0; } 00294 00295 int open_fuzzy( istream& istm, size_t iSkip, CDat& Dat ) { 00296 CPCL PCL; 00297 size_t i, j, k; 00298 float dCur, dMax; 00299 00300 if( !PCL.Open( istm, iSkip ) ) { 00301 cerr << "Could not open fuzzy PCL" << endl; 00302 return 1; } 00303 00304 Dat.Open( PCL.GetGeneNames( ) ); 00305 for( i = 0; i < Dat.GetGenes( ); ++i ) 00306 for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) { 00307 dMax = 0; 00308 for( k = 0; k < PCL.GetExperiments( ); ++k ) 00309 if( ( dCur = PCL.Get( i, k ) * PCL.Get( j, k ) ) > dMax ) 00310 dMax = dCur; 00311 Dat.Set( i, j, dMax ); } 00312 00313 return 0; }