GenomeSequenceHelpers.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef _GENOME_SEQUENCE_HELPERS_H
00019 #define _GENOME_SEQUENCE_HELPERS_H
00020 
00021 #if !defined(MD5_DIGEST_LENGTH)
00022 #define MD5_DIGEST_LENGTH 16
00023 #endif
00024 
00025 #include "MemoryMapArray.h"
00026 
00027 #include <stdint.h>
00028 
00029 
00030 //
00031 // ChromosomeInfo represents the per chromosome information
00032 // necessary to write out SAM/BAM records.  In addition, it
00033 // contains a single internal index used to point to the vector
00034 // offset where the chromosome bases start.
00035 //
00036 // This is mildly non-optimal for larger collections of chromosomes
00037 // or contigs - one use case described having millions of contigs,
00038 // in which case, this structure alone would take a gigabyte or more.
00039 //
00040 struct ChromosomeInfo
00041 {
00042     static const int  MAX_GENOME_INFO_STRING=128;
00043 
00044     void constructorClear()
00045     {
00046         memset(this,0, sizeof(*this));
00047     }
00048     void setChromosomeName(const char *n)
00049     {
00050         strncpy(name, n, sizeof(name)-1);
00051     }
00052     genomeIndex_t   start;                              // internal offset to combined genome vector
00053     genomeIndex_t   size;                               // SAM SQ:LN value
00054     char            md5[2*MD5_DIGEST_LENGTH + 1];       // 32 chars plus NUL, SAM SQ:M5 value
00055     char            name[MAX_GENOME_INFO_STRING];       // SAM SQ:SN value
00056     char            assemblyID[MAX_GENOME_INFO_STRING]; // SAM SQ:AS value
00057     char            uri[MAX_GENOME_INFO_STRING];        // SAM SQ:UR value
00058     char            species[MAX_GENOME_INFO_STRING];    // SAM SQ:SP value
00059 
00060     // handy setting methods:
00061     void setAssemblyID(const char *newID)
00062     {
00063         strncpy(assemblyID, newID, sizeof(assemblyID) - 1);
00064     }
00065     void setSpecies(const char *newSpecies)
00066     {
00067         strncpy(species, newSpecies, sizeof(newSpecies));
00068     }
00069     void setURI(const char *newURI)
00070     {
00071         strncpy(uri, newURI, sizeof(uri));
00072     }
00073 };
00074 
00075 class genomeSequenceMmapHeader : public MemoryMapArrayHeader
00076 {
00077     friend class GenomeSequence;
00078     friend std::ostream &operator << (std::ostream &, genomeSequenceMmapHeader &);
00079 private:
00080     uint32_t    _chromosomeCount;
00081     bool        _colorSpace;
00082 
00083     ChromosomeInfo  _chromosomes[0];
00084 
00085 public:
00086     //
00087     // getHeaderSize is special in that it must not access any
00088     // member variables, since it is called when the header has
00089     // not been created yet.
00090     //
00091     static size_t  getHeaderSize(int chromosomeCount)
00092     {
00093         return sizeof(genomeSequenceMmapHeader) + sizeof(ChromosomeInfo[1]) * chromosomeCount;
00094     }
00095     //
00096     // below methods return TRUE if it failed, false otherwise (primarily
00097     // a length check).
00098     //
00099 };
00100 
00101 std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h);
00102 
00103 //
00104 // define the genomeSequence array type:
00105 //
00106 // NB the access/set routines use the encoded base values in the range
00107 // 0-15, not the corresponding base pair character.
00108 //
00109 inline uint8_t genomeSequenceAccess(void *base, genomeIndex_t index)
00110 {
00111     if ((index&1)==0)
00112     {
00113         return ((uint8_t *) base)[index>>1] & 0xf;
00114     }
00115     else
00116     {
00117         return (((uint8_t *) base)[index>>1] & 0xf0) >> 4;
00118     }
00119 };
00120 inline void genomeSequenceSet(void *base, genomeIndex_t index, uint8_t v)
00121 {
00122     if ((index&1)==0)
00123     {
00124         ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0xf0) | v;
00125     }
00126     else
00127     {
00128         ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0x0f) | v<<4;
00129     }
00130 }
00131 
00132 inline size_t mmapGenomeSequenceElementCount2Bytes(genomeIndex_t i)
00133 {
00134     return sizeof(uint8_t) * i / 2;
00135 }
00136 
00137 class PackedRead
00138 {
00139     void set(int index, int val)
00140     {
00141         packedBases[index>>1] =
00142             (packedBases[index>>1]             // original value
00143              & ~(7<<((index&0x01)<<2)))         // logical AND off the original value
00144             | ((val&0x0f)<<((index&0x1)<<2));  // logical OR in the new value
00145     }
00146 public:
00147     std::vector<uint8_t> packedBases;
00148     uint32_t    length;
00149     int size()
00150     {
00151         return length;
00152     }
00153     void clear()
00154     {
00155         packedBases.clear();
00156         length=0;
00157     }
00158     void set(const char *rhs, int padWithNCount = 0);
00159     uint8_t operator [](int index)
00160     {
00161         return (packedBases[index>>1] >> ((index&0x1)<<2)) & 0xf;
00162     }
00163 };
00164 
00165 #endif
Generated on Mon Feb 11 13:45:18 2013 for libStatGen Software by  doxygen 1.6.3