Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #ifndef DATABASEI_H 00023 #define DATABASEI_H 00024 00025 #include <fstream> 00026 #include <map> 00027 #include <vector> 00028 00029 //Qian added 00030 #include <stdio.h> 00031 00032 #include "compactmatrix.h" 00033 00034 namespace Sleipnir { 00035 00036 class CDatabaselet { 00037 public: 00038 enum ENibbles { 00039 ENibblesLow, 00040 ENibblesHigh, 00041 ENibblesBoth 00042 }; 00043 00044 CDatabaselet( bool ); 00045 ~CDatabaselet( ); 00046 00047 bool Open( const std::string&, const std::vector<std::string>&, uint32_t, uint32_t ); 00048 bool Open( const std::string& ); 00049 bool Open( const std::vector<CCompactFullMatrix>&, size_t, size_t, bool ); 00050 00051 bool OpenNoOverwrite(); 00052 00053 //directly write bytes to disk 00054 bool Write(char* data, const size_t& iSize, const size_t offset = 0); 00055 00056 bool OpenWrite( unsigned char, size_t, ENibbles, unsigned char* ); 00057 00058 /* Get pair by referring to memory cache (ie charImage) of the db file */ 00059 bool Get( size_t iOne, size_t iTwo, vector<unsigned char>& vecbData, unsigned char *charImage); 00060 bool Get(size_t offset, vector<unsigned char>& vecbData, unsigned char *charImage); 00061 00062 /* Get pair by seeking in db file */ 00063 bool Get( size_t, size_t, std::vector<unsigned char>& ) const; 00064 bool Get( size_t, std::vector<unsigned char>&, bool ) const; 00065 bool Get( size_t, const std::vector<size_t>&, std::vector<unsigned char>&, bool ) const; 00066 bool Get(size_t, vector<unsigned char>&); 00067 00068 bool Set(uint32_t&, uint32_t&, vector<string>&); 00069 00070 static bool Combine(std::vector<CDatabaselet*>& vecDatabaselet, 00071 std::string strOutDirectory, vector<string> &vecstrGenes, bool bSplit = true); 00072 00073 size_t GetGenes( ) const { 00074 00075 return m_vecstrGenes.size( ); } 00076 00077 const std::string& GetGene( size_t iGene ) const { 00078 static const std::string c_strEmpty = ""; 00079 00080 return ( m_vecstrGenes.empty( ) ? c_strEmpty : m_vecstrGenes[ iGene % m_vecstrGenes.size( ) ] ); } 00081 00082 void Write( size_t iOne, size_t iTwo, size_t iDataset, unsigned char bValue, bool fBoth = false ) { 00083 std::streamoff iOffset; 00084 00085 iOffset = (std::streamoff)GetOffset( iOne, iTwo, iDataset ); 00086 00087 if(m_useNibble){ 00088 if( !fBoth ) { 00089 unsigned char b; 00090 m_fstm.seekg( iOffset, ios_base::beg ); 00091 b = m_fstm.get( ); 00092 bValue = ( iDataset % 2 ) ? ( ( b & 0xF ) | ( bValue << 4 ) ) : 00093 ( ( b & 0xF0 ) | ( bValue & 0xF ) ); 00094 } 00095 } 00096 00097 m_fstm.seekp( iOffset, ios_base::beg); 00098 m_fstm.put( bValue ); 00099 } 00100 00101 size_t GetDatasets( ) const { 00102 00103 return m_iDatasets; } 00104 00105 void CloseFile(){ 00106 if(m_fstm.is_open()){ 00107 m_fstm.close(); 00108 } 00109 } 00110 00111 void SetFile(string std){ 00112 strFileName = std; 00113 } 00114 00115 string GetFile() const{ 00116 return strFileName; 00117 } 00118 00119 unsigned char* GetCharImage(){ 00120 size_t iImageSize = GetSizeGenes(); 00121 unsigned char *charImage = (unsigned char*)malloc(iImageSize*sizeof(unsigned char)); 00122 00123 // read databaselet into charImage 00124 if(m_fstm.is_open()){ 00125 m_fstm.seekg(m_iHeader, ios_base::beg); 00126 m_fstm.read((char*) charImage, iImageSize); 00127 }else{ 00128 cerr << "CDatabaselet is not open." << endl; 00129 free(charImage); 00130 return NULL; 00131 } 00132 00133 return charImage; 00134 } 00135 00136 size_t GetImageSize(){ 00137 return GetSizeGenes(); 00138 } 00139 00140 00141 private: 00142 size_t GetSizeGenes( ) const { 00143 return ( GetSizeGene( ) * m_vecstrGenes.size( ) ); } 00144 00145 00146 size_t GetOffsetDataset( size_t iDataset ) const { 00147 if(m_useNibble){ 00148 return (iDataset / 2); 00149 }else{ 00150 return iDataset; 00151 } 00152 } 00153 00154 size_t GetSizePair( ) const { 00155 00156 if(m_useNibble){ 00157 return (m_iDatasets + 1) / 2; 00158 }else{ 00159 return m_iDatasets; 00160 } 00161 00162 } 00163 00164 size_t GetSizeGene( ) const { 00165 00166 return ( GetSizePair( ) * m_iGenes ); } 00167 00168 size_t GetOffset( size_t iGene ) const { 00169 00170 return ( m_iHeader + ( GetSizeGene( ) * iGene ) ); } 00171 00172 size_t GetOffset( size_t iOne, size_t iTwo ) const { 00173 00174 return ( GetOffset( iOne ) + ( GetSizePair( ) * iTwo ) ); } 00175 00176 size_t GetOffset( size_t iOne, size_t iTwo, size_t iDataset ) const { 00177 00178 return ( GetOffset( iOne, iTwo ) + GetOffsetDataset( iDataset ) ); } 00179 00180 uint32_t m_iGenes; 00181 uint32_t m_iDatasets; 00182 std::vector<std::string> m_vecstrGenes; 00183 std::string strFileName; 00184 00185 mutable std::fstream m_fstm; 00186 uint32_t m_iHeader; 00187 00188 bool m_useNibble; 00189 mutable pthread_mutex_t* m_pmutx; 00190 }; 00191 00192 class CDatabaseImpl { 00193 protected: 00194 static const char c_acDAB[]; 00195 static const char c_acQDAB[]; 00196 static const char c_acExtension[]; 00197 00198 CDatabaseImpl(bool useNibble){ 00199 m_fMemmap = false; 00200 m_iBlockIn = -1; 00201 m_iBlockOut = -1; 00202 m_fBuffer = false; 00203 m_useNibble = useNibble; 00204 } 00205 00206 ~CDatabaseImpl( ) { 00207 00208 Clear( ); } 00209 00210 bool Open( const std::vector<std::string>&, const std::vector<std::string>&, const map<string, size_t>& ); 00211 bool Open( const std::string&, size_t, bool = false ); 00212 00213 void Clear( ) { 00214 size_t i; 00215 m_mapstriGenes.clear( ); 00216 for( i = 0; i < m_vecpDBs.size( ); ++i ) 00217 delete m_vecpDBs[ i ]; 00218 m_vecpDBs.clear( ); } 00219 00220 size_t GetGene( const std::string& strGene ) const { 00221 std::map<std::string, size_t>::const_iterator iterGene; 00222 00223 return ( ( ( iterGene = m_mapstriGenes.find( strGene ) ) == m_mapstriGenes.end( ) ) ? -1 : 00224 iterGene->second ); } 00225 00226 bool m_fMemmap; 00227 bool m_fBuffer; 00228 size_t m_iBlockIn; 00229 size_t m_iBlockOut; 00230 std::vector<CDatabaselet*> m_vecpDBs; 00231 std::map<std::string, size_t> m_mapstriGenes; 00232 /* defines whether the CDatabaselet is nibble type. If false, it is byte by default.*/ 00233 bool m_useNibble; 00234 }; 00235 00236 } 00237 00238 #endif // DATABASEI_H