Sleipnir
src/databasei.h
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #ifndef DATABASEI_H
00023 #define DATABASEI_H
00024 
00025 #include <fstream>
00026 #include <map>
00027 #include <vector>
00028 
00029 //Qian added
00030 #include <stdio.h>
00031 
00032 #include "compactmatrix.h"
00033 
00034 namespace Sleipnir {
00035 
00036 class CDatabaselet {
00037 public:
00038     enum ENibbles {
00039         ENibblesLow,
00040         ENibblesHigh,
00041         ENibblesBoth
00042     };
00043 
00044     CDatabaselet( bool );
00045     ~CDatabaselet( );
00046 
00047     bool Open( const std::string&, const std::vector<std::string>&, uint32_t, uint32_t );
00048     bool Open( const std::string& );
00049     bool Open( const std::vector<CCompactFullMatrix>&, size_t, size_t, bool );
00050 
00051     bool OpenNoOverwrite();
00052 
00053     //directly write bytes to disk
00054     bool Write(char* data, const size_t& iSize, const size_t offset = 0);
00055 
00056     bool OpenWrite( unsigned char, size_t, ENibbles, unsigned char* );
00057 
00058     /* Get pair by referring to memory cache (ie charImage) of the db file */
00059     bool Get( size_t iOne, size_t iTwo, vector<unsigned char>& vecbData, unsigned char *charImage);
00060     bool Get(size_t offset, vector<unsigned char>& vecbData, unsigned char *charImage);
00061 
00062     /* Get pair by seeking in db file */
00063     bool Get( size_t, size_t, std::vector<unsigned char>& ) const;
00064     bool Get( size_t, std::vector<unsigned char>&, bool ) const;
00065     bool Get( size_t, const std::vector<size_t>&, std::vector<unsigned char>&, bool ) const;
00066     bool Get(size_t, vector<unsigned char>&);
00067 
00068     bool Set(uint32_t&, uint32_t&, vector<string>&);
00069 
00070     static bool Combine(std::vector<CDatabaselet*>& vecDatabaselet,
00071             std::string strOutDirectory, vector<string> &vecstrGenes, bool bSplit = true);
00072 
00073     size_t GetGenes( ) const {
00074 
00075         return m_vecstrGenes.size( ); }
00076 
00077     const std::string& GetGene( size_t iGene ) const {
00078         static const std::string    c_strEmpty  = "";
00079 
00080         return ( m_vecstrGenes.empty( ) ? c_strEmpty : m_vecstrGenes[ iGene % m_vecstrGenes.size( ) ] ); }
00081 
00082     void Write( size_t iOne, size_t iTwo, size_t iDataset, unsigned char bValue, bool fBoth = false ) {
00083         std::streamoff  iOffset;
00084 
00085         iOffset = (std::streamoff)GetOffset( iOne, iTwo, iDataset );
00086 
00087         if(m_useNibble){
00088             if( !fBoth ) {
00089                 unsigned char   b;
00090                 m_fstm.seekg( iOffset, ios_base::beg );
00091                 b = m_fstm.get( );
00092                 bValue = ( iDataset % 2 ) ? ( ( b & 0xF ) | ( bValue << 4 ) ) :
00093                         ( ( b & 0xF0 ) | ( bValue & 0xF ) ); 
00094                 }
00095         }
00096 
00097         m_fstm.seekp( iOffset, ios_base::beg);
00098         m_fstm.put( bValue );
00099     }
00100 
00101     size_t GetDatasets( ) const {
00102 
00103         return m_iDatasets; }
00104 
00105     void CloseFile(){
00106         if(m_fstm.is_open()){
00107             m_fstm.close();
00108         }
00109     }
00110 
00111     void SetFile(string std){
00112         strFileName = std;
00113     }
00114 
00115     string GetFile() const{
00116         return strFileName;
00117     }
00118 
00119     unsigned char* GetCharImage(){
00120         size_t iImageSize = GetSizeGenes();
00121         unsigned char *charImage = (unsigned char*)malloc(iImageSize*sizeof(unsigned char));
00122 
00123         // read databaselet into charImage
00124         if(m_fstm.is_open()){
00125             m_fstm.seekg(m_iHeader, ios_base::beg);
00126             m_fstm.read((char*) charImage, iImageSize);
00127         }else{
00128             cerr << "CDatabaselet is not open." << endl;
00129             free(charImage);
00130             return NULL;
00131         }
00132 
00133         return charImage;
00134     }
00135     
00136     size_t GetImageSize(){
00137         return GetSizeGenes();
00138     }
00139 
00140 
00141 private:
00142     size_t GetSizeGenes( ) const {
00143         return ( GetSizeGene( ) * m_vecstrGenes.size( ) ); }
00144 
00145 
00146     size_t GetOffsetDataset( size_t iDataset ) const {
00147         if(m_useNibble){
00148             return (iDataset / 2);
00149         }else{
00150             return iDataset;
00151         }
00152     }
00153 
00154     size_t GetSizePair( ) const {
00155 
00156         if(m_useNibble){
00157             return (m_iDatasets + 1) / 2;
00158         }else{
00159             return m_iDatasets;
00160         }
00161 
00162     }
00163 
00164     size_t GetSizeGene( ) const {
00165 
00166         return ( GetSizePair( ) * m_iGenes ); }
00167 
00168     size_t GetOffset( size_t iGene ) const {
00169 
00170         return ( m_iHeader + ( GetSizeGene( ) * iGene ) ); }
00171 
00172     size_t GetOffset( size_t iOne, size_t iTwo ) const {
00173 
00174         return ( GetOffset( iOne ) + ( GetSizePair( ) * iTwo ) ); }
00175 
00176     size_t GetOffset( size_t iOne, size_t iTwo, size_t iDataset ) const {
00177 
00178         return ( GetOffset( iOne, iTwo ) + GetOffsetDataset( iDataset ) ); }
00179 
00180     uint32_t                    m_iGenes;
00181     uint32_t                    m_iDatasets;
00182     std::vector<std::string>    m_vecstrGenes;
00183     std::string                 strFileName;
00184 
00185     mutable std::fstream        m_fstm;
00186     uint32_t                    m_iHeader;
00187 
00188     bool                        m_useNibble;
00189     mutable pthread_mutex_t*    m_pmutx;
00190 };
00191 
00192 class CDatabaseImpl {
00193 protected:
00194     static const char   c_acDAB[];
00195     static const char   c_acQDAB[];
00196     static const char   c_acExtension[];
00197 
00198     CDatabaseImpl(bool useNibble){
00199         m_fMemmap = false;
00200         m_iBlockIn = -1;
00201         m_iBlockOut = -1;
00202         m_fBuffer = false;
00203         m_useNibble = useNibble;
00204     }
00205 
00206     ~CDatabaseImpl( ) {
00207 
00208         Clear( ); }
00209 
00210     bool Open( const std::vector<std::string>&, const std::vector<std::string>&, const map<string, size_t>& );
00211     bool Open( const std::string&, size_t, bool = false );
00212 
00213     void Clear( ) {
00214         size_t  i;
00215         m_mapstriGenes.clear( );
00216         for( i = 0; i < m_vecpDBs.size( ); ++i )
00217             delete m_vecpDBs[ i ];
00218         m_vecpDBs.clear( ); }
00219 
00220     size_t GetGene( const std::string& strGene ) const {
00221         std::map<std::string, size_t>::const_iterator   iterGene;
00222 
00223         return ( ( ( iterGene = m_mapstriGenes.find( strGene ) ) == m_mapstriGenes.end( ) ) ? -1 :
00224             iterGene->second ); }
00225 
00226     bool                            m_fMemmap;
00227     bool                            m_fBuffer;
00228     size_t                          m_iBlockIn;
00229     size_t                          m_iBlockOut;
00230     std::vector<CDatabaselet*>      m_vecpDBs;
00231     std::map<std::string, size_t>   m_mapstriGenes;
00232     /* defines whether the CDatabaselet is nibble type. If false, it is byte by default.*/
00233     bool                            m_useNibble;
00234 };
00235 
00236 }
00237 
00238 #endif // DATABASEI_H