Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "seekquery.h" 00023 #include "seekreader.h" 00024 00025 00026 namespace Sleipnir { 00027 00028 CSeekQuery::CSeekQuery(){ 00029 crossValGenes = NULL; 00030 queryGenes.clear(); 00031 queryGenePresence.clear(); 00032 iNumFold = 0; 00033 iFoldSize = 0; 00034 } 00035 CSeekQuery::~CSeekQuery(){ 00036 Reset(); 00037 } 00038 00039 bool CSeekQuery::Reset(){ 00040 if(crossValGenes!=NULL){ 00041 delete[] crossValGenes; 00042 crossValGenes = NULL; 00043 } 00044 queryGenePresence.clear(); 00045 queryGenes.clear(); 00046 iNumFold = 0; 00047 iFoldSize = 0; 00048 } 00049 00050 bool CSeekQuery::InitializeQuery(const vector<char> &query){ 00051 Reset(); 00052 utype i; 00053 queryGenePresence.resize(query.size()); 00054 00055 for(i=0; i<query.size(); i++){ 00056 if(query[i]==1) queryGenes.push_back(i); 00057 queryGenes[i] = query[i]; 00058 } 00059 queryGenes.resize(queryGenes.size()); 00060 return true; 00061 } 00062 00063 bool CSeekQuery::InitializeQuery(const vector<utype> &query, 00064 const utype &iGenes){ 00065 Reset(); 00066 utype i; 00067 queryGenePresence.resize(iGenes); 00068 fill(queryGenePresence.begin(), queryGenePresence.end(), (char) 0); 00069 for(i=0; i<query.size(); i++){ 00070 queryGenes.push_back(query[i]); 00071 queryGenePresence[query[i]] = 1; 00072 } 00073 queryGenes.resize(queryGenes.size()); 00074 return true; 00075 } 00076 00077 utype CSeekQuery::GetNumFold() const{ 00078 return iNumFold; 00079 } 00080 00081 const vector<char>& CSeekQuery::GetQueryPresence() const{ 00082 return queryGenePresence; 00083 } 00084 00085 const vector<utype>& CSeekQuery::GetQuery() const{ 00086 return queryGenes; 00087 } 00088 00089 const vector<utype>& CSeekQuery::GetCVQuery(utype &i) const{ 00090 return crossValGenes[i]; 00091 } 00092 00093 bool CSeekQuery::CreateCVPartitions(const gsl_rng *rnd, 00094 const CSeekQuery::PartitionMode &p, const utype iFold){ 00095 //must have run initializequery beforehand 00096 if(p!=LEAVE_ONE_IN && p!=LEAVE_ONE_OUT && p!=CUSTOM_PARTITION){ 00097 cerr << "Error, unknown partition mode" << endl; 00098 return false; 00099 } 00100 qSize = queryGenes.size(); 00101 utype fold_size = 0; 00102 utype iFoldx = iFold; 00103 if(CSeekTools::IsNaN(iFold)){ 00104 if(p==LEAVE_ONE_IN){ 00105 iFoldx = qSize; 00106 fold_size = 1; 00107 }else if(p==LEAVE_ONE_OUT){ 00108 iFoldx = qSize; 00109 fold_size = qSize-1; 00110 }else{ 00111 cerr << "Error, must specify number of folds if \ 00112 CustomPartition mode" << endl; 00113 return false; 00114 } 00115 }else{ 00116 if(p==LEAVE_ONE_IN){ 00117 iFoldx = qSize; 00118 fold_size = 1; 00119 }else if(p==LEAVE_ONE_OUT){ 00120 iFoldx = qSize; 00121 fold_size = qSize - 1; 00122 }else{ //CUSTOM_PART 00123 /*if(p==LEAVE_ONE_IN || p==LEAVE_ONE_OUT){ 00124 cerr << "Error, specified number of folds, so this must NOT be \ 00125 LEAVE_ONE_OUT or LEAVE_ONE_IN" << endl; 00126 return false; 00127 }*/ 00128 if(qSize <= iFoldx){ 00129 fold_size = 1; 00130 iFoldx = qSize; 00131 }else{ 00132 fold_size = qSize / iFoldx; 00133 if(qSize % iFoldx > 0){ 00134 fold_size++; 00135 } 00136 } 00137 } 00138 } 00139 iNumFold = iFoldx; 00140 iFoldSize = fold_size; 00141 crossValGenes = new vector<utype>[iNumFold]; 00142 //printf("Fold size %d %d\n", iNumFold, iFoldSize); 00143 00144 utype i, j, k; 00145 utype *q_b = (utype*)malloc(qSize*sizeof(utype)); 00146 for(i=0; i<qSize; i++){ 00147 q_b[i] = queryGenes[i]; 00148 //printf("%d ", q_b[i]); 00149 } 00150 //printf("\n"); 00151 //getchar(); 00152 gsl_ran_shuffle(rnd, q_b, qSize, sizeof(utype)); 00153 00154 if(p==LEAVE_ONE_IN || p==CUSTOM_PARTITION){ 00155 k = 0; 00156 for(i=0; i<iNumFold; i++){ 00157 for(j=0; j<iFoldSize; j++){ 00158 if(k==qSize) continue; 00159 crossValGenes[i].push_back(q_b[k]); 00160 k++; 00161 } 00162 crossValGenes[i].resize(crossValGenes[i].size()); 00163 } 00164 }else if(p==LEAVE_ONE_OUT){ 00165 utype current_index = -1; 00166 for(i=0; i<iNumFold; i++){ 00167 for(j=0; j<iFoldSize; j++){ 00168 current_index = (i+j) % qSize; 00169 crossValGenes[i].push_back(q_b[current_index]); 00170 } 00171 crossValGenes[i].resize(crossValGenes[i].size()); 00172 } 00173 } 00174 00175 free(q_b); 00176 return true; 00177 } 00178 }