libStatGen Software  1
GenomeSequenceHelpers.h
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef _GENOME_SEQUENCE_HELPERS_H
00019 #define _GENOME_SEQUENCE_HELPERS_H
00020 
00021 #if !defined(MD5_DIGEST_LENGTH)
00022 #define MD5_DIGEST_LENGTH 16
00023 #endif
00024 
00025 #include "MemoryMapArray.h"
00026 
00027 #include <stdint.h>
00028 
00029 
00030 //
00031 // ChromosomeInfo represents the per chromosome information
00032 // necessary to write out SAM/BAM records.  In addition, it
00033 // contains a single internal index used to point to the vector
00034 // offset where the chromosome bases start.
00035 //
00036 // This is mildly non-optimal for larger collections of chromosomes
00037 // or contigs - one use case described having millions of contigs,
00038 // in which case, this structure alone would take a gigabyte or more.
00039 //
00040 struct ChromosomeInfo
00041 {
00042     static const int  MAX_GENOME_INFO_STRING=128;
00043 
00044     void constructorClear()
00045     {
00046         memset(this,0, sizeof(*this));
00047     }
00048     void setChromosomeName(const char *n)
00049     {
00050         strncpy(name, n, sizeof(name)-1);
00051         name[sizeof(name)-1] = '\0';
00052     }
00053     genomeIndex_t   start;                              // internal offset to combined genome vector
00054     genomeIndex_t   size;                               // SAM SQ:LN value
00055     char            md5[2*MD5_DIGEST_LENGTH + 1];       // 32 chars plus NUL, SAM SQ:M5 value
00056     char            name[MAX_GENOME_INFO_STRING];       // SAM SQ:SN value
00057     char            assemblyID[MAX_GENOME_INFO_STRING]; // SAM SQ:AS value
00058     char            uri[MAX_GENOME_INFO_STRING];        // SAM SQ:UR value
00059     char            species[MAX_GENOME_INFO_STRING];    // SAM SQ:SP value
00060 
00061     // handy setting methods:
00062     void setAssemblyID(const char *newID)
00063     {
00064         strncpy(assemblyID, newID, sizeof(assemblyID)-1);
00065         name[sizeof(name)-1] = '\0';
00066     }
00067     void setSpecies(const char *newSpecies)
00068     {
00069         strncpy(species, newSpecies, sizeof(species)-1);
00070         species[sizeof(species)-1] = '\0';
00071     }
00072     void setURI(const char *newURI)
00073     {
00074         strncpy(uri, newURI, sizeof(uri)-1);
00075         uri[sizeof(uri)-1] = '\0';
00076     }
00077 };
00078 
00079 class genomeSequenceMmapHeader : public MemoryMapArrayHeader
00080 {
00081     friend class GenomeSequence;
00082     friend std::ostream &operator << (std::ostream &, genomeSequenceMmapHeader &);
00083 private:
00084     uint32_t    _chromosomeCount;
00085     bool        _colorSpace;
00086 
00087     ChromosomeInfo  _chromosomes[0];
00088 
00089 public:
00090     //
00091     // getHeaderSize is special in that it must not access any
00092     // member variables, since it is called when the header has
00093     // not been created yet.
00094     //
00095     static size_t  getHeaderSize(int chromosomeCount)
00096     {
00097         return sizeof(genomeSequenceMmapHeader) + sizeof(ChromosomeInfo[1]) * chromosomeCount;
00098     }
00099     //
00100     // below methods return TRUE if it failed, false otherwise (primarily
00101     // a length check).
00102     //
00103 };
00104 
00105 std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h);
00106 
00107 //
00108 // define the genomeSequence array type:
00109 //
00110 // NB the access/set routines use the encoded base values in the range
00111 // 0-15, not the corresponding base pair character.
00112 //
00113 inline uint8_t genomeSequenceAccess(void *base, genomeIndex_t index)
00114 {
00115     if ((index&1)==0)
00116     {
00117         return ((uint8_t *) base)[index>>1] & 0xf;
00118     }
00119     else
00120     {
00121         return (((uint8_t *) base)[index>>1] & 0xf0) >> 4;
00122     }
00123 };
00124 inline void genomeSequenceSet(void *base, genomeIndex_t index, uint8_t v)
00125 {
00126     if ((index&1)==0)
00127     {
00128         ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0xf0) | v;
00129     }
00130     else
00131     {
00132         ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0x0f) | v<<4;
00133     }
00134 }
00135 
00136 inline size_t mmapGenomeSequenceElementCount2Bytes(genomeIndex_t i)
00137 {
00138     return sizeof(uint8_t) * i / 2;
00139 }
00140 
00141 class PackedRead
00142 {
00143     void set(int index, int val)
00144     {
00145         packedBases[index>>1] =
00146             (packedBases[index>>1]             // original value
00147              & ~(7<<((index&0x01)<<2)))         // logical AND off the original value
00148             | ((val&0x0f)<<((index&0x1)<<2));  // logical OR in the new value
00149     }
00150 public:
00151     std::vector<uint8_t> packedBases;
00152     uint32_t    length;
00153     int size()
00154     {
00155         return length;
00156     }
00157     void clear()
00158     {
00159         packedBases.clear();
00160         length=0;
00161     }
00162     void set(const char *rhs, int padWithNCount = 0);
00163     uint8_t operator [](int index)
00164     {
00165         return (packedBases[index>>1] >> ((index&0x1)<<2)) & 0xf;
00166     }
00167 };
00168 
00169 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends