libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef _GENOME_SEQUENCE_HELPERS_H 00019 #define _GENOME_SEQUENCE_HELPERS_H 00020 00021 #if !defined(MD5_DIGEST_LENGTH) 00022 #define MD5_DIGEST_LENGTH 16 00023 #endif 00024 00025 #include "MemoryMapArray.h" 00026 00027 #include <stdint.h> 00028 00029 00030 // 00031 // ChromosomeInfo represents the per chromosome information 00032 // necessary to write out SAM/BAM records. In addition, it 00033 // contains a single internal index used to point to the vector 00034 // offset where the chromosome bases start. 00035 // 00036 // This is mildly non-optimal for larger collections of chromosomes 00037 // or contigs - one use case described having millions of contigs, 00038 // in which case, this structure alone would take a gigabyte or more. 00039 // 00040 struct ChromosomeInfo 00041 { 00042 static const int MAX_GENOME_INFO_STRING=128; 00043 00044 void constructorClear() 00045 { 00046 memset(this,0, sizeof(*this)); 00047 } 00048 void setChromosomeName(const char *n) 00049 { 00050 strncpy(name, n, sizeof(name)-1); 00051 name[sizeof(name)-1] = '\0'; 00052 } 00053 genomeIndex_t start; // internal offset to combined genome vector 00054 genomeIndex_t size; // SAM SQ:LN value 00055 char md5[2*MD5_DIGEST_LENGTH + 1]; // 32 chars plus NUL, SAM SQ:M5 value 00056 char name[MAX_GENOME_INFO_STRING]; // SAM SQ:SN value 00057 char assemblyID[MAX_GENOME_INFO_STRING]; // SAM SQ:AS value 00058 char uri[MAX_GENOME_INFO_STRING]; // SAM SQ:UR value 00059 char species[MAX_GENOME_INFO_STRING]; // SAM SQ:SP value 00060 00061 // handy setting methods: 00062 void setAssemblyID(const char *newID) 00063 { 00064 strncpy(assemblyID, newID, sizeof(assemblyID)-1); 00065 name[sizeof(name)-1] = '\0'; 00066 } 00067 void setSpecies(const char *newSpecies) 00068 { 00069 strncpy(species, newSpecies, sizeof(species)-1); 00070 species[sizeof(species)-1] = '\0'; 00071 } 00072 void setURI(const char *newURI) 00073 { 00074 strncpy(uri, newURI, sizeof(uri)-1); 00075 uri[sizeof(uri)-1] = '\0'; 00076 } 00077 }; 00078 00079 class genomeSequenceMmapHeader : public MemoryMapArrayHeader 00080 { 00081 friend class GenomeSequence; 00082 friend std::ostream &operator << (std::ostream &, genomeSequenceMmapHeader &); 00083 private: 00084 uint32_t _chromosomeCount; 00085 bool _colorSpace; 00086 00087 ChromosomeInfo _chromosomes[0]; 00088 00089 public: 00090 // 00091 // getHeaderSize is special in that it must not access any 00092 // member variables, since it is called when the header has 00093 // not been created yet. 00094 // 00095 static size_t getHeaderSize(int chromosomeCount) 00096 { 00097 return sizeof(genomeSequenceMmapHeader) + sizeof(ChromosomeInfo[1]) * chromosomeCount; 00098 } 00099 // 00100 // below methods return TRUE if it failed, false otherwise (primarily 00101 // a length check). 00102 // 00103 }; 00104 00105 std::ostream &operator << (std::ostream &stream, genomeSequenceMmapHeader &h); 00106 00107 // 00108 // define the genomeSequence array type: 00109 // 00110 // NB the access/set routines use the encoded base values in the range 00111 // 0-15, not the corresponding base pair character. 00112 // 00113 inline uint8_t genomeSequenceAccess(void *base, genomeIndex_t index) 00114 { 00115 if ((index&1)==0) 00116 { 00117 return ((uint8_t *) base)[index>>1] & 0xf; 00118 } 00119 else 00120 { 00121 return (((uint8_t *) base)[index>>1] & 0xf0) >> 4; 00122 } 00123 }; 00124 inline void genomeSequenceSet(void *base, genomeIndex_t index, uint8_t v) 00125 { 00126 if ((index&1)==0) 00127 { 00128 ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0xf0) | v; 00129 } 00130 else 00131 { 00132 ((uint8_t *) base)[index>>1] = (((uint8_t *) base)[index>>1] & 0x0f) | v<<4; 00133 } 00134 } 00135 00136 inline size_t mmapGenomeSequenceElementCount2Bytes(genomeIndex_t i) 00137 { 00138 return sizeof(uint8_t) * i / 2; 00139 } 00140 00141 class PackedRead 00142 { 00143 void set(int index, int val) 00144 { 00145 packedBases[index>>1] = 00146 (packedBases[index>>1] // original value 00147 & ~(7<<((index&0x01)<<2))) // logical AND off the original value 00148 | ((val&0x0f)<<((index&0x1)<<2)); // logical OR in the new value 00149 } 00150 public: 00151 std::vector<uint8_t> packedBases; 00152 uint32_t length; 00153 int size() 00154 { 00155 return length; 00156 } 00157 void clear() 00158 { 00159 packedBases.clear(); 00160 length=0; 00161 } 00162 void set(const char *rhs, int padWithNCount = 0); 00163 uint8_t operator [](int index) 00164 { 00165 return (packedBases[index>>1] >> ((index&0x1)<<2)) & 0xf; 00166 } 00167 }; 00168 00169 #endif