Sleipnir
tools/Clusterer/Clusterer.cpp
00001 /*****************************************************************************
00002  * This file is provided under the Creative Commons Attribution 3.0 license.
00003  *
00004  * You are free to share, copy, distribute, transmit, or adapt this work
00005  * PROVIDED THAT you attribute the work to the authors listed below.
00006  * For more information, please see the following web page:
00007  * http://creativecommons.org/licenses/by/3.0/
00008  *
00009  * This file is a component of the Sleipnir library for functional genomics,
00010  * authored by:
00011  * Curtis Huttenhower (chuttenh@princeton.edu)
00012  * Mark Schroeder
00013  * Maria D. Chikina
00014  * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015  *
00016  * If you use this library, the included executable tools, or any related
00017  * code in your work, please cite the following publication:
00018  * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019  * Olga G. Troyanskaya.
00020  * "The Sleipnir library for computational functional genomics"
00021  *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 static const char c_acQTC[] = "qtc";
00026 
00027 int main(int iArgs, char** aszArgs) {
00028     size_t i, j;
00029     ifstream ifsm;
00030     CDat DatIn, DatOut;
00031     vector<uint16_t> vecsClusters;
00032     CPCL* pPCL;
00033     uint16_t sClusters;
00034     CPCL PCL, Ranks, Weights;
00035     gengetopt_args_info sArgs;
00036     IMeasure* pMeasure;
00037     CMeasurePearson Pearson;
00038     CMeasureEuclidean Euclidean;
00039     CMeasureKendallsTau KendallsTau;
00040     CMeasureKolmogorovSmirnov KolmSmir;
00041     CMeasureSpearman Spearman(true);
00042     CMeasureQuickPearson QuickPear;
00043     CMeasureNegate EuclideanNeg(&Euclidean, false);
00044     IMeasure* apMeasures[] = { &Pearson, &EuclideanNeg, &KendallsTau,
00045             &KolmSmir, &Spearman, &QuickPear, NULL };
00046 
00047     if (cmdline_parser(iArgs, aszArgs, &sArgs)) {
00048         cmdline_parser_print_help();
00049         return 1;
00050     }
00051     CMeta Meta(sArgs.verbosity_arg, sArgs.random_arg);
00052 
00053     pMeasure = NULL;
00054     for (i = 0; apMeasures[i]; ++i)
00055         if (!strcmp(apMeasures[i]->GetName(), sArgs.distance_arg)) {
00056             if ((pMeasure = apMeasures[i]) == &EuclideanNeg)
00057                 sArgs.normalize_flag = true;
00058             break;
00059         }
00060     if (!pMeasure) {
00061         cmdline_parser_print_help();
00062         return 1;
00063     }
00064 
00065     CMeasureAutocorrelate Autocorrelate(pMeasure, false);
00066     if (sArgs.autocorrelate_flag)
00067         pMeasure = &Autocorrelate;
00068 
00069     if (sArgs.input_arg) {
00070         ifsm.open(sArgs.input_arg);
00071         if (PCL.Open(ifsm, sArgs.skip_arg)) {
00072             ifsm.close();
00073             cerr << "Opened PCL: " << sArgs.input_arg << endl;
00074         } else {
00075             ifsm.close();
00076             if (!DatIn.Open(sArgs.input_arg)) {
00077                 cerr << "Could not open input: " << sArgs.input_arg << endl;
00078                 return 1;
00079             }
00080             cerr << "Opened DAB: " << sArgs.input_arg << endl;
00081             if (sArgs.pcl_arg) {
00082                 ifsm.clear();
00083                 ifsm.open(sArgs.pcl_arg);
00084             }
00085             if (!PCL.Open(sArgs.pcl_arg ? ifsm : cin, sArgs.skip_arg)) {
00086                 cerr << "Could not open PCL" << endl;
00087                 return 1;
00088             }
00089             ifsm.close();
00090         }
00091     } else if (PCL.Open(cin, sArgs.skip_arg))
00092         cerr << "Opened PCL" << endl;
00093     else {
00094         cerr << "Could not open PCL" << endl;
00095         return 1;
00096     }
00097 
00098     if (sArgs.weights_arg) {
00099         ifsm.clear();
00100         ifsm.open(sArgs.weights_arg);
00101         if (!Weights.Open(ifsm, sArgs.skip_arg)) {
00102             cerr << "Could not open: " << sArgs.weights_arg << endl;
00103             return 1;
00104         }
00105         ifsm.close();
00106 
00107         if ((Weights.GetExperiments() != PCL.GetExperiments())
00108                 || (Weights.GetGenes() != PCL.GetGenes())) {
00109             cerr << "Illegal data sizes: " << PCL.GetExperiments() << 'x'
00110                     << PCL.GetGenes() << ", " << Weights.GetExperiments()
00111                     << 'x' << Weights.GetGenes() << endl;
00112             return 1;
00113         }
00114     }
00115 
00116     if (pMeasure->IsRank()) {
00117         Ranks.Open(PCL);
00118         Ranks.RankTransform();
00119         pPCL = &Ranks;
00120     } else
00121         pPCL = &PCL;
00122 
00123     if (sArgs.delta_arg) {
00124         DatOut.Open(pPCL->GetGeneNames());
00125         if (DatIn.GetGenes())
00126             CClustQTC::Cluster(DatIn.Get(), (float) sArgs.diamineter_arg,
00127                     (float) sArgs.diameter_arg, (float) sArgs.delta_arg,
00128                     sArgs.size_arg, DatOut.Get());
00129         else
00130             CClustQTC::Cluster(pPCL->Get(), pMeasure,
00131                     (float) sArgs.diamineter_arg, (float) sArgs.diameter_arg,
00132                     (float) sArgs.delta_arg, sArgs.size_arg, DatOut.Get(),
00133                     sArgs.weights_arg ? &Weights.Get() : NULL );
00134         DatOut.Save(sArgs.output_arg);
00135     } else {
00136         if (!strcmp(sArgs.algorithm_arg, c_acQTC))
00137             sClusters = DatIn.GetGenes() ? CClustQTC::Cluster(DatIn.Get(),
00138                     (float) sArgs.diameter_arg, sArgs.size_arg, vecsClusters)
00139                     : CClustQTC::Cluster(pPCL->Get(), pMeasure,
00140                             (float) sArgs.diameter_arg, sArgs.size_arg,
00141                             vecsClusters, sArgs.weights_arg ? &Weights.Get()
00142                                     : NULL );
00143         else
00144             sClusters
00145                     = (DatIn.GetGenes() ? CClustKMeans::Cluster(DatIn.Get(),
00146                             sArgs.size_arg, vecsClusters)
00147                             : CClustKMeans::Cluster(pPCL->Get(), pMeasure,
00148                                     sArgs.size_arg, vecsClusters,
00149                                     sArgs.weights_arg ? &Weights.Get() : NULL )) ? sArgs.size_arg
00150                             : 0;
00151 
00152         if (!sClusters) {
00153             cerr << "No clusters found" << endl;
00154             return 1;
00155         } else if (sArgs.output_arg && ! sArgs.pcl_out_flag) {
00156             DatOut.Open(pPCL->GetGeneNames());
00157             for (i = 0; i < vecsClusters.size(); ++i) {
00158                 if ((vecsClusters[i] + 1) == sClusters)
00159                     continue;
00160                 for (j = (i + 1); j < vecsClusters.size(); ++j) {
00161                     if ((vecsClusters[j] + 1) == sClusters)
00162                         continue;
00163                     DatOut.Set(i, j,
00164                             (vecsClusters[i] == vecsClusters[j]) ? 1.0f : 0.0f);
00165                 }
00166             }
00167             DatOut.Save(sArgs.output_arg);
00168         } else {
00169             ostream* posm;
00170             ofstream ofsm;
00171             if (sArgs.output_info_given) {
00172                 ofsm.open(sArgs.output_info_arg);
00173                 posm = &ofsm;
00174             } else {
00175                 posm = &cout;
00176             }
00177             if (sArgs.summary_flag) {
00178                 int* iCounts = new int[sClusters];
00179                 fill(iCounts, iCounts + sClusters, 0);
00180                 float** dSums = new float*[sClusters];
00181                 float** dSqSums = new float*[sClusters];
00182                 size_t nExp = pPCL->GetExperiments();
00183                 for (i = 0; i < sClusters; i++) {
00184                     dSums[i] = new float[sClusters];
00185                     fill(dSums[i], dSums[i] + sClusters, 0.0f);
00186                     dSqSums[i] = new float[sClusters];
00187                     fill(dSqSums[i], dSqSums[i] + sClusters, 0.0f);
00188                 }
00189                 for (i = 0; i < vecsClusters.size(); ++i) {
00190                     iCounts[vecsClusters[i]]++;
00191                     for (j = 0; j < nExp; ++j) {
00192                         dSums[vecsClusters[i]][j] += pPCL->Get(i, j);
00193                         dSqSums[vecsClusters[i]][j] += (pPCL->Get(i, j))
00194                                 * (pPCL->Get(i, j));
00195                     }
00196                 }
00197                 for (i = 0; i < sClusters; ++i)
00198                     *posm << "Size of Cluster" << i << '\t' << iCounts[i]
00199                             << endl;
00200 
00201                 for (i = 0; i < sClusters; ++i) {
00202                     *posm << "Mean of Cluster" << i;
00203                     for (j = 0; j < nExp; ++j) {
00204                         dSums[i][j] /= iCounts[i];
00205                         *posm << '\t' << dSums[i][j];
00206                         dSqSums[i][j] = sqrt(dSqSums[i][j] / iCounts[i]
00207                                 - dSums[i][j] * dSums[i][j]);
00208                     }
00209                     *posm << endl;
00210                 }
00211                 for (i = 0; i < sClusters; ++i) {
00212                     *posm << "SD of Cluster" << i;
00213                     for (j = 0; j < nExp; ++j) {
00214                         *posm << '\t' << dSqSums[i][j];
00215                         }
00216                     *posm << endl;
00217                 }
00218             } else if (1) {
00219                 size_t iF = pPCL->AddFeature("Assignment");
00220                 std::stringstream ss;
00221 
00222                 for (i = 0; i < vecsClusters.size(); ++i) {
00223                     ss.str("");
00224                     ss << vecsClusters[i];
00225                     pPCL->SetFeature(i, iF, ss.str());
00226                 }
00227                 pPCL->Save(sArgs.output_arg);
00228             } else {
00229                 for (i = 0; i < vecsClusters.size(); ++i)
00230                     *posm << pPCL->GetGene(i) << '\t' << vecsClusters[i]
00231                             << endl;
00232             }
00233         }
00234     }
00235     return 0;
00236 }