Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 static const char c_acQTC[] = "qtc"; 00026 00027 int main(int iArgs, char** aszArgs) { 00028 size_t i, j; 00029 ifstream ifsm; 00030 CDat DatIn, DatOut; 00031 vector<uint16_t> vecsClusters; 00032 CPCL* pPCL; 00033 uint16_t sClusters; 00034 CPCL PCL, Ranks, Weights; 00035 gengetopt_args_info sArgs; 00036 IMeasure* pMeasure; 00037 CMeasurePearson Pearson; 00038 CMeasureEuclidean Euclidean; 00039 CMeasureKendallsTau KendallsTau; 00040 CMeasureKolmogorovSmirnov KolmSmir; 00041 CMeasureSpearman Spearman(true); 00042 CMeasureQuickPearson QuickPear; 00043 CMeasureNegate EuclideanNeg(&Euclidean, false); 00044 IMeasure* apMeasures[] = { &Pearson, &EuclideanNeg, &KendallsTau, 00045 &KolmSmir, &Spearman, &QuickPear, NULL }; 00046 00047 if (cmdline_parser(iArgs, aszArgs, &sArgs)) { 00048 cmdline_parser_print_help(); 00049 return 1; 00050 } 00051 CMeta Meta(sArgs.verbosity_arg, sArgs.random_arg); 00052 00053 pMeasure = NULL; 00054 for (i = 0; apMeasures[i]; ++i) 00055 if (!strcmp(apMeasures[i]->GetName(), sArgs.distance_arg)) { 00056 if ((pMeasure = apMeasures[i]) == &EuclideanNeg) 00057 sArgs.normalize_flag = true; 00058 break; 00059 } 00060 if (!pMeasure) { 00061 cmdline_parser_print_help(); 00062 return 1; 00063 } 00064 00065 CMeasureAutocorrelate Autocorrelate(pMeasure, false); 00066 if (sArgs.autocorrelate_flag) 00067 pMeasure = &Autocorrelate; 00068 00069 if (sArgs.input_arg) { 00070 ifsm.open(sArgs.input_arg); 00071 if (PCL.Open(ifsm, sArgs.skip_arg)) { 00072 ifsm.close(); 00073 cerr << "Opened PCL: " << sArgs.input_arg << endl; 00074 } else { 00075 ifsm.close(); 00076 if (!DatIn.Open(sArgs.input_arg)) { 00077 cerr << "Could not open input: " << sArgs.input_arg << endl; 00078 return 1; 00079 } 00080 cerr << "Opened DAB: " << sArgs.input_arg << endl; 00081 if (sArgs.pcl_arg) { 00082 ifsm.clear(); 00083 ifsm.open(sArgs.pcl_arg); 00084 } 00085 if (!PCL.Open(sArgs.pcl_arg ? ifsm : cin, sArgs.skip_arg)) { 00086 cerr << "Could not open PCL" << endl; 00087 return 1; 00088 } 00089 ifsm.close(); 00090 } 00091 } else if (PCL.Open(cin, sArgs.skip_arg)) 00092 cerr << "Opened PCL" << endl; 00093 else { 00094 cerr << "Could not open PCL" << endl; 00095 return 1; 00096 } 00097 00098 if (sArgs.weights_arg) { 00099 ifsm.clear(); 00100 ifsm.open(sArgs.weights_arg); 00101 if (!Weights.Open(ifsm, sArgs.skip_arg)) { 00102 cerr << "Could not open: " << sArgs.weights_arg << endl; 00103 return 1; 00104 } 00105 ifsm.close(); 00106 00107 if ((Weights.GetExperiments() != PCL.GetExperiments()) 00108 || (Weights.GetGenes() != PCL.GetGenes())) { 00109 cerr << "Illegal data sizes: " << PCL.GetExperiments() << 'x' 00110 << PCL.GetGenes() << ", " << Weights.GetExperiments() 00111 << 'x' << Weights.GetGenes() << endl; 00112 return 1; 00113 } 00114 } 00115 00116 if (pMeasure->IsRank()) { 00117 Ranks.Open(PCL); 00118 Ranks.RankTransform(); 00119 pPCL = &Ranks; 00120 } else 00121 pPCL = &PCL; 00122 00123 if (sArgs.delta_arg) { 00124 DatOut.Open(pPCL->GetGeneNames()); 00125 if (DatIn.GetGenes()) 00126 CClustQTC::Cluster(DatIn.Get(), (float) sArgs.diamineter_arg, 00127 (float) sArgs.diameter_arg, (float) sArgs.delta_arg, 00128 sArgs.size_arg, DatOut.Get()); 00129 else 00130 CClustQTC::Cluster(pPCL->Get(), pMeasure, 00131 (float) sArgs.diamineter_arg, (float) sArgs.diameter_arg, 00132 (float) sArgs.delta_arg, sArgs.size_arg, DatOut.Get(), 00133 sArgs.weights_arg ? &Weights.Get() : NULL ); 00134 DatOut.Save(sArgs.output_arg); 00135 } else { 00136 if (!strcmp(sArgs.algorithm_arg, c_acQTC)) 00137 sClusters = DatIn.GetGenes() ? CClustQTC::Cluster(DatIn.Get(), 00138 (float) sArgs.diameter_arg, sArgs.size_arg, vecsClusters) 00139 : CClustQTC::Cluster(pPCL->Get(), pMeasure, 00140 (float) sArgs.diameter_arg, sArgs.size_arg, 00141 vecsClusters, sArgs.weights_arg ? &Weights.Get() 00142 : NULL ); 00143 else 00144 sClusters 00145 = (DatIn.GetGenes() ? CClustKMeans::Cluster(DatIn.Get(), 00146 sArgs.size_arg, vecsClusters) 00147 : CClustKMeans::Cluster(pPCL->Get(), pMeasure, 00148 sArgs.size_arg, vecsClusters, 00149 sArgs.weights_arg ? &Weights.Get() : NULL )) ? sArgs.size_arg 00150 : 0; 00151 00152 if (!sClusters) { 00153 cerr << "No clusters found" << endl; 00154 return 1; 00155 } else if (sArgs.output_arg && ! sArgs.pcl_out_flag) { 00156 DatOut.Open(pPCL->GetGeneNames()); 00157 for (i = 0; i < vecsClusters.size(); ++i) { 00158 if ((vecsClusters[i] + 1) == sClusters) 00159 continue; 00160 for (j = (i + 1); j < vecsClusters.size(); ++j) { 00161 if ((vecsClusters[j] + 1) == sClusters) 00162 continue; 00163 DatOut.Set(i, j, 00164 (vecsClusters[i] == vecsClusters[j]) ? 1.0f : 0.0f); 00165 } 00166 } 00167 DatOut.Save(sArgs.output_arg); 00168 } else { 00169 ostream* posm; 00170 ofstream ofsm; 00171 if (sArgs.output_info_given) { 00172 ofsm.open(sArgs.output_info_arg); 00173 posm = &ofsm; 00174 } else { 00175 posm = &cout; 00176 } 00177 if (sArgs.summary_flag) { 00178 int* iCounts = new int[sClusters]; 00179 fill(iCounts, iCounts + sClusters, 0); 00180 float** dSums = new float*[sClusters]; 00181 float** dSqSums = new float*[sClusters]; 00182 size_t nExp = pPCL->GetExperiments(); 00183 for (i = 0; i < sClusters; i++) { 00184 dSums[i] = new float[sClusters]; 00185 fill(dSums[i], dSums[i] + sClusters, 0.0f); 00186 dSqSums[i] = new float[sClusters]; 00187 fill(dSqSums[i], dSqSums[i] + sClusters, 0.0f); 00188 } 00189 for (i = 0; i < vecsClusters.size(); ++i) { 00190 iCounts[vecsClusters[i]]++; 00191 for (j = 0; j < nExp; ++j) { 00192 dSums[vecsClusters[i]][j] += pPCL->Get(i, j); 00193 dSqSums[vecsClusters[i]][j] += (pPCL->Get(i, j)) 00194 * (pPCL->Get(i, j)); 00195 } 00196 } 00197 for (i = 0; i < sClusters; ++i) 00198 *posm << "Size of Cluster" << i << '\t' << iCounts[i] 00199 << endl; 00200 00201 for (i = 0; i < sClusters; ++i) { 00202 *posm << "Mean of Cluster" << i; 00203 for (j = 0; j < nExp; ++j) { 00204 dSums[i][j] /= iCounts[i]; 00205 *posm << '\t' << dSums[i][j]; 00206 dSqSums[i][j] = sqrt(dSqSums[i][j] / iCounts[i] 00207 - dSums[i][j] * dSums[i][j]); 00208 } 00209 *posm << endl; 00210 } 00211 for (i = 0; i < sClusters; ++i) { 00212 *posm << "SD of Cluster" << i; 00213 for (j = 0; j < nExp; ++j) { 00214 *posm << '\t' << dSqSums[i][j]; 00215 } 00216 *posm << endl; 00217 } 00218 } else if (1) { 00219 size_t iF = pPCL->AddFeature("Assignment"); 00220 std::stringstream ss; 00221 00222 for (i = 0; i < vecsClusters.size(); ++i) { 00223 ss.str(""); 00224 ss << vecsClusters[i]; 00225 pPCL->SetFeature(i, iF, ss.str()); 00226 } 00227 pPCL->Save(sArgs.output_arg); 00228 } else { 00229 for (i = 0; i < vecsClusters.size(); ++i) 00230 *posm << pPCL->GetGene(i) << '\t' << vecsClusters[i] 00231 << endl; 00232 } 00233 } 00234 } 00235 return 0; 00236 }