00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef _PILEUP_READER_H 00019 #define _PILEUP_READER_H 00020 00021 #include <list> 00022 #include <stdint.h> 00023 #include <string> 00024 #include <queue> 00025 00026 #include "SamFile.h" 00027 00028 /// 00029 /// \file this file implements a means of reading the 00030 /// bases and structural variations of interest that are 00031 /// present in any BAM file. 00032 /// 00033 /// Two data structures contain the pertinent information: 00034 /// 00035 /// - ReadBase contains information about a single base in 00036 /// a single sequence, and is a copy of the useful fields in 00037 /// the BAM record that it originated from. 00038 /// 00039 /// - ReadInsertion contains the information that can't 00040 /// be represented as a base itself, which is material 00041 /// inserted at the genome at a given location. 00042 /// 00043 /// The class ReadBase represents either bases that are directly 00044 /// present in the class or it represents deletions of those 00045 /// bases. So for example, if you ask for chromosome 1, position 00046 /// 25 and the corresponding aligned read indicates in the CIGAR 00047 /// string that there is a deletion there, the ReadBase object 00048 /// will not show a valid base, but rather indicate that this 00049 /// base and some number of subsequent bases were deleted. 00050 /// 00051 /// Insertions cannot be represented directly as a single added 00052 /// base nor as a set of bases that were deleted (which ReadBase 00053 /// does) but rather a set of bases that appears in between two 00054 /// positions on the reference. 00055 /// 00056 /// The PileupReader class provides the mechanism for attaching 00057 /// to a BAM file and retrieving the above data. 00058 /// 00059 /// Use cases: 00060 /// 00061 /// - read the BAM data associated with specific known marker 00062 /// positions, usually those read locations that are provided 00063 /// by a sequencing chip. In this case, PileupReader will by 00064 /// default use an index if present to rapidly skip from marker 00065 /// to marker. This behavior may be controlled by using the 00066 /// useIndex() method. 00067 /// 00068 /// - scan the entire genome, base by base, reading bases and 00069 /// insertions and evaluating them, for example to do novel 00070 /// SNP detection. 00071 /// 00072 00073 /// 00074 /// When we call PileupReader::getPileup, we return a set 00075 /// of these objects representing the bases found in all 00076 /// overlapping reads in the given BAM file. 00077 /// 00078 /// The information in this class is the pertinent information 00079 /// from the SAM format record. 00080 /// 00081 class ReadBase { 00082 SamRecord *_samRecord; 00083 int _deletedBaseCount; // how many bases, including this one, are deleted 00084 int _insertedBasesOverlapping; 00085 bool _InsertionFollowing; // THIS read contains an insertion immediately after this base 00086 00087 // More private state is needed - e.g. offset and length into read. 00088 // But also cigar string parsing state information - probably need 00089 // a small common state class used here and in ReadInsertion 00090 public: 00091 ReadBase() : _deletedBaseCount(0), _insertedBasesOverlapping(0), _InsertionFollowing(false) {;} 00092 00093 bool set(SamRecord &, int basePosition); 00094 00095 char getBase(); 00096 char getPhredQualityChar(); 00097 int getPhredQualityScore(); 00098 00099 int getDeletedBaseCount(); 00100 int getInsertedBasesOverlapping(); 00101 00102 bool hasInsertionFollowing(); 00103 }; 00104 00105 /// 00106 /// Contain all pertinent information from overlapping 00107 /// SAM records that START an insertion at this point. 00108 /// 00109 class ReadInsertion { 00110 SamRecord *_samRecord; 00111 // More private state is needed - e.g. offset and length into read. 00112 // But also cigar string parsing state information - probably need 00113 // a small common state class used here and in ReadBase 00114 public: 00115 bool set(SamRecord &r); 00116 00117 /// STL single copy access: 00118 bool getQualities(std::string &phredQualities); 00119 bool getBases(std::string &bases); 00120 00121 /// zero copy access: 00122 int getQualities(char *phredQualities, int maxLength); 00123 int getBases(char *bases, int maxLength); 00124 00125 uint8_t getMapQuality(); 00126 uint16_t getSAMFlag(); 00127 SamRecord & getSamRecord(); 00128 }; 00129 00130 /// 00131 /// class for tracking BAM file sequence data that covers markers of interest. 00132 /// 00133 /// Given a sorted BAM file, allow us to scan over the file, either base by 00134 /// base or marker by marker in ascending order, and obtain a list of 00135 /// bases from the BAM file at the corresponding position. 00136 /// 00137 /// In the event that an insertion is noticed after the marker of interest, 00138 /// signal that in getPileup, and allow the data to be returned by calling 00139 /// getInsertions(). 00140 /// 00141 /// 00142 class SequenceCoverageReader { 00143 00144 private: 00145 SamFile& _samFile; 00146 int _referenceID; 00147 uint32_t _position; 00148 bool _useIndex; 00149 std::list<SamRecord *> _samRecords; // list of ptrs to records that overlap the last read pileup position 00150 std::queue<SamRecord *> _unused; // fifo to recliam pre-allocated records from 00151 00152 public: 00153 00154 SequenceCoverageReader(SamFile &samFile) : 00155 _samFile(samFile), 00156 _referenceID(0), 00157 _position(0), 00158 _useIndex(true) 00159 { ; } 00160 00161 void useIndex(bool u=true) { _useIndex = u; } 00162 00163 /// \brief getBases returns a pileup vector of the same 00164 /// length as the number of reads that overlaps the given 00165 /// chromosome and position 00166 /// 00167 /// \param referenceID identifies the chromosome in which 00168 /// to retrieve bases from. 00169 /// 00170 /// \param position indicates the 1-based offset of the 00171 /// base of interest on that chromosome. This number is 00172 /// relative to the genome reference that the BAM file 00173 /// was aligned against. 00174 /// 00175 /// \param readBases a caller allocated vector of of readBase 00176 /// objects 00177 /// 00178 /// \param basesInserted bool that indicates if getInsertions() will 00179 /// return a non-zero vector of bases that were inserted AFTER 00180 /// this position. Since reads may truncate this insertion, 00181 /// they may be different lengths. 00182 /// 00183 /// \return true if more data exists in the BAM file, false otherwise 00184 /// 00185 bool getBases(int32_t referenceID, uint32_t position, std::vector<ReadBase> &readBases, bool &basesInserted); 00186 00187 bool getInsertedBases(int referenceID, uint32_t position, std::vector<ReadInsertion> &readInsertions); 00188 00189 }; 00190 00191 00192 #endif