PileupReader.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef _PILEUP_READER_H
00019 #define _PILEUP_READER_H
00020 
00021 #include <list>
00022 #include <stdint.h>
00023 #include <string>
00024 #include <queue>
00025 
00026 #include "SamFile.h"
00027 
00028 ///
00029 /// \file this file implements a means of reading the
00030 /// bases and structural variations of interest that are
00031 /// present in any BAM file.
00032 /// 
00033 /// Two data structures contain the pertinent information:
00034 ///
00035 /// - ReadBase contains information about a single base in
00036 ///   a single sequence, and is a copy of the useful fields in
00037 ///   the BAM record that it originated from.
00038 ///
00039 /// - ReadInsertion contains the information that can't
00040 ///   be represented as a base itself, which is material
00041 ///   inserted at the genome at a given location.
00042 ///
00043 /// The class ReadBase represents either bases that are directly
00044 /// present in the class or it represents deletions of those
00045 /// bases.  So for example, if you ask for chromosome 1, position
00046 /// 25 and the corresponding aligned read indicates in the CIGAR
00047 /// string that there is a deletion there, the ReadBase object
00048 /// will not show a valid base, but rather indicate that this
00049 /// base and some number of subsequent bases were deleted.
00050 ///
00051 /// Insertions cannot be represented directly as a single added
00052 /// base nor as a set of bases that were deleted (which ReadBase
00053 /// does) but rather a set of bases that appears in between two
00054 /// positions on the reference.
00055 ///
00056 /// The PileupReader class provides the mechanism for attaching
00057 /// to a BAM file and retrieving the above data.
00058 ///
00059 /// Use cases:
00060 ///
00061 /// - read the BAM data associated with specific known marker
00062 ///   positions, usually those read locations that are provided
00063 ///   by a sequencing chip.  In this case, PileupReader will by
00064 ///   default use an index if present to rapidly skip from marker
00065 ///   to marker.  This behavior may be controlled by using the
00066 ///   useIndex() method.
00067 ///
00068 /// - scan the entire genome, base by base, reading bases and
00069 ///   insertions and evaluating them, for example to do novel
00070 ///   SNP detection.
00071 /// 
00072 
00073 ///
00074 /// When we call PileupReader::getPileup, we return a set
00075 /// of these objects representing the bases found in all
00076 /// overlapping reads in the given BAM file.
00077 ///
00078 /// The information in this class is the pertinent information
00079 /// from the SAM format record.
00080 ///
00081 class ReadBase {
00082     SamRecord   *_samRecord;
00083     int         _deletedBaseCount;   // how many bases, including this one, are deleted
00084     int         _insertedBasesOverlapping;
00085     bool        _InsertionFollowing;        // THIS read contains an insertion immediately after this base
00086 
00087     // More private state is needed - e.g. offset and length into read.
00088     // But also cigar string parsing state information - probably need
00089     // a small common state class used here and in ReadInsertion
00090 public:
00091     ReadBase() : _deletedBaseCount(0), _insertedBasesOverlapping(0), _InsertionFollowing(false) {;}
00092 
00093     bool set(SamRecord &, int basePosition);
00094 
00095     char getBase();
00096     char getPhredQualityChar();
00097     int  getPhredQualityScore();
00098 
00099     int  getDeletedBaseCount();
00100     int  getInsertedBasesOverlapping();
00101 
00102     bool hasInsertionFollowing();
00103 };
00104 
00105 ///
00106 /// Contain all pertinent information from overlapping
00107 /// SAM records that START an insertion at this point.
00108 ///
00109 class ReadInsertion {
00110     SamRecord *_samRecord;
00111     // More private state is needed - e.g. offset and length into read.
00112     // But also cigar string parsing state information - probably need
00113     // a small common state class used here and in ReadBase
00114 public:
00115     bool set(SamRecord &r);
00116 
00117     /// STL single copy access:
00118     bool        getQualities(std::string &phredQualities);
00119     bool        getBases(std::string &bases);
00120 
00121     /// zero copy access:
00122     int         getQualities(char *phredQualities, int maxLength);
00123     int         getBases(char *bases, int maxLength);
00124 
00125     uint8_t     getMapQuality();
00126     uint16_t    getSAMFlag();
00127     SamRecord & getSamRecord();
00128 };
00129 
00130 ///
00131 /// class for tracking BAM file sequence data that covers markers of interest.
00132 ///
00133 /// Given a sorted BAM file, allow us to scan over the file, either base by
00134 /// base or marker by marker in ascending order, and obtain a list of
00135 /// bases from the BAM file at the corresponding position.
00136 ///
00137 /// In the event that an insertion is noticed after the marker of interest,
00138 /// signal that in getPileup, and allow the data to be returned by calling
00139 /// getInsertions().
00140 ///
00141 ///
00142 class SequenceCoverageReader {
00143 
00144 private:
00145     SamFile&    _samFile;
00146     int         _referenceID;
00147     uint32_t    _position;
00148     bool        _useIndex;
00149     std::list<SamRecord *>    _samRecords;    // list of ptrs to records that overlap the last read pileup position
00150     std::queue<SamRecord *> _unused;         // fifo to recliam pre-allocated records from
00151 
00152 public:
00153 
00154     SequenceCoverageReader(SamFile &samFile) :
00155         _samFile(samFile),
00156         _referenceID(0),
00157         _position(0),
00158         _useIndex(true)
00159         { ; }
00160 
00161     void useIndex(bool u=true) { _useIndex = u; }
00162 
00163     /// \brief getBases returns a pileup vector of the same
00164     /// length as the number of reads that overlaps the given
00165     /// chromosome and position
00166     /// 
00167     /// \param referenceID  identifies the chromosome in which
00168     /// to retrieve bases from.
00169     /// 
00170     /// \param position  indicates the 1-based offset of the 
00171     /// base of interest on that chromosome.  This number is
00172     /// relative to the genome reference that the BAM file
00173     /// was aligned against.
00174     /// 
00175     /// \param readBases a caller allocated vector of of readBase
00176     /// objects
00177     /// 
00178     /// \param basesInserted bool that indicates if getInsertions() will
00179     /// return a non-zero vector of bases that were inserted AFTER
00180     /// this position.  Since reads may truncate this insertion,
00181     /// they may be different lengths.
00182     ///
00183     /// \return true if more data exists in the BAM file, false otherwise
00184     ///
00185     bool getBases(int32_t referenceID, uint32_t position, std::vector<ReadBase> &readBases, bool &basesInserted);
00186 
00187     bool getInsertedBases(int referenceID, uint32_t position, std::vector<ReadInsertion> &readInsertions);
00188 
00189 };
00190 
00191 
00192 #endif
Generated on Wed Nov 17 15:38:27 2010 for StatGen Software by  doxygen 1.6.3