libStatGen Software  1
BgzfFileTypeRecovery.cpp
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifdef __ZLIB_AVAILABLE__
00019 
00020 #include "BgzfFileTypeRecovery.h"
00021 
00022 #include <stdio.h>
00023 #include <stdint.h>
00024 
00025 #include <sys/types.h>
00026 #include <sys/stat.h>
00027 #include <fcntl.h>
00028 #include <stdlib.h>
00029 #include <string.h>
00030 #include <zlib.h>
00031 
00032 #include <fstream>
00033 #include <iostream>
00034 #include <stdexcept>
00035 #include <vector>
00036 
00037 #pragma pack(push,1)
00038 
00039 #define debug false
00040 
//
// Fixed leading portion of a gzip member header, declared field for
// field so that an instance can be overlaid on raw bytes from a file.
//
// The field order and types ARE the on-disk layout - do not reorder
// them or add data members.
//
// NOTE(review): the multi-byte fields are compared in host byte order,
// so the overlay presumably only works on little-endian hosts - confirm
// before building for a big-endian target.
//
class RecoveryGzipHeader {
private:
    uint8_t m_ID1;
    uint8_t m_ID2;
    uint8_t m_CM;
    uint8_t m_FLG;
    uint32_t m_MTIME;
    uint8_t m_XFL;
    uint8_t m_OS;
    uint16_t m_XLEN;
public:
    // Every field starts at zero, so a fresh object is deliberately
    // NOT sane() until it is filled in.
    RecoveryGzipHeader() :
        m_ID1(0),
        m_ID2(0),
        m_CM(0),
        m_FLG(0),
        m_MTIME(0),
        m_XFL(0),
        m_OS(0),
        m_XLEN(0)
    {}

    // Load exactly the values that sane() accepts.
    void defaults() {
        m_ID1 = 31;
        m_ID2 = 139;
        m_CM = 8;
        m_FLG = 4;
        m_MTIME = 0;
        m_XFL = 0;
        m_OS = 255;
        m_XLEN = 6;
    }

    // Read-only field accessors.
    uint8_t ID1() const {return m_ID1;}
    uint8_t ID2() const {return m_ID2;}
    uint8_t CM() const {return m_CM;}
    uint8_t FLG() const {return m_FLG;}
    uint32_t MTIME() const {return m_MTIME;}
    uint8_t XFL() const {return m_XFL;}
    uint8_t OS() const {return m_OS;}
    uint16_t XLEN() const {return m_XLEN;}

    // True only when every field holds the one value this recovery
    // code expects; used as the signature when hunting for the next
    // block in a damaged stream.
    bool sane() const {
        return (m_ID1==31 && m_ID2==139 && m_CM==8 && m_FLG==4 && m_MTIME==0 && m_XFL == 0 && m_OS == 255 && m_XLEN==6);
    }
};

//
// The gzip header above followed by the BGZF extra subfield
// (SI1, SI2, SLEN, BSIZE).
//
class BGZFHeader : public RecoveryGzipHeader {
private:
    uint8_t m_SI1;
    uint8_t m_SI2;
    uint16_t m_SLEN;    // little endian
    uint16_t m_BSIZE;   // little endian
public:
    // The defaults describe a well-formed subfield with an (invalid)
    // zero block size.  The inherited gzip fields stay zeroed.
    BGZFHeader(
            uint8_t si1 = 'B',
            uint8_t si2 = 'C',
            uint16_t slen = 2,
            uint16_t bsize = 0
    ) : m_SI1(si1), m_SI2(si2), m_SLEN(slen), m_BSIZE(bsize) {}

    // Read-only field accessors.
    uint8_t SI1() const {return m_SI1;}
    uint8_t SI2() const {return m_SI2;}
    uint16_t SLEN() const {return m_SLEN;}
    uint16_t BSIZE() const {return m_BSIZE;}

    // A header is believable when the gzip part is sane, the subfield
    // is tagged 'B','C' with a 2 byte payload, and the block size is
    // larger than the header itself.
    bool sane() const {
        return RecoveryGzipHeader::sane() &&
            (m_SI1=='B' && m_SI2=='C' && m_SLEN==2 && m_BSIZE > sizeof(BGZFHeader));
    }
};
00108 
00109 #pragma pack(pop)
00110 
00111 //
00112 // PeekaheadBuffer allows non-destructive peekahead and resyncing
00113 // after read errors when the underlying stream has signatures in the
00114 // data that allow it.
00115 //
00116 // In addition, it has a peek() capability to allow
00117 // the caller to look ahead in the stream to see
00118 // a certain number of bytes before actually consuming them.
00119 //
00120 // The intent is that this class behave as something of a poor
00121 // man's FIFO - with the cost of buffer movement when data is removed.
00122 //
00123 // This is far from ideal, but we basically are moving data around
00124 // when allowing arbitrary peekahead regardless.
00125 //
// The basis for the design is the fact that most read calls to
// various streams at best allow a single character to be peeked
// at, and secondly, do not allow for recovery after an underlying
// framing error occurs.
//
// That is, getchar()/putchar()/ungetc() support a single byte
// peek.  This may be fine for simple parsing applications, but here
// we need to look at blocks up to 64K or more in size to search
// for signatures while re-synchronizing on the underlying stream.
00135 //
class PeekaheadBuffer : public std::vector<uint8_t> {

protected:
    // Index of the first byte in the vector that has not yet been
    // handed to a caller.  Bytes before it are consumed but may still
    // be physically present until shiftData() erases them.
    ssize_t m_startPosition;  // start of fresh data

public:
    // Result of a readahead()/read()/sync() call.
    enum ReturnCode {
        endOfFile = -1,
        reSync = 0,
        ok = 1
    };

    ssize_t startPosition() const {return m_startPosition;}

private:
    //
    // when remaining data is less than 1/8 the size of the full
    // buffer, erase the consumed prefix so the fresh data sits at
    // the start of the vector again.
    //
    // for use by read(), which will consume data from the buffer.
    //
    void shiftData() {
        if(dataRemaining() < (ssize_t) (std::vector<uint8_t>::size() / 8) ) {
            erase(begin(), begin() + m_startPosition);
            m_startPosition = 0;
        }
    }
    // called when read reports an error for some
    // reason -
    virtual ReturnCode sync();
public:
    PeekaheadBuffer();
    virtual ~PeekaheadBuffer();

    // return the amount of unused data:
    ssize_t dataRemaining();

    //
    // overload size() to throw an exception - too confusing otherwise
//    size_t size() {abort();}

    //
    // just populate data in buffer from stream - not generic
    //
    // XXX note that it simply ensures that count bytes of data
    // are actually loaded into the buffer - if that amount of
    // data (or more) is present, this call is a NOP.
    //
    virtual ReturnCode readahead(ssize_t count) = 0;

    //
    // read is generic.
    // remove data from our buffer - call non-generic readahead to populate data.
    //
    // buffer - destination, must have room for count bytes
    // count  - number of bytes wanted; zero or less copies nothing
    //
    // Returns ok once count bytes were copied and consumed, reSync
    // when readahead() reported a recoverable stream error, and
    // endOfFile when readahead() could not supply the data.  Nothing
    // is copied or consumed unless the result is ok.
    //
    ReturnCode read(uint8_t *buffer, ssize_t count) {
        const ReturnCode rc = readahead(count);

        if(rc == reSync) {
            // readahead hit a CRC or other error - the caller has to
            // see whether it can sync forwards
            return reSync;
        }
        if(rc != ok) {
            // failed to get needed data - premature EOF, I guess
            return endOfFile;
        }

        // A request for nothing must not touch the storage: the vector
        // may be empty, in which case *begin() is not dereferenceable,
        // and a negative count would become a huge memcpy length.
        if(count > 0) {
            const uint8_t *src = &(*begin()) + m_startPosition;

            memcpy(buffer, src, count);

            m_startPosition += count;  // consume data

            // recover space if wasting too much:
            shiftData();
        }

        return ok;
    }

};
00215 
00216 PeekaheadBuffer::PeekaheadBuffer() : m_startPosition(0)
00217 {
00218 }
00219 
00220 PeekaheadBuffer::~PeekaheadBuffer()
00221 {
00222 }
00223 
00224 PeekaheadBuffer::ReturnCode PeekaheadBuffer::sync() {
00225     clear();
00226     return ok;
00227 }
00228 
00229 ssize_t PeekaheadBuffer::dataRemaining()
00230 {
00231     return std::vector<uint8_t>::size() - m_startPosition;
00232 }
00233 
00234 
00235 // peekahead buffered file reader class
00236 class FileReader : public PeekaheadBuffer {
00237     FILE    *m_stream;
00238 public:
00239     FileReader();
00240     ~FileReader();
00241     FileReader(FILE *stream);
00242     PeekaheadBuffer::ReturnCode readahead(ssize_t count);
00243     FILE *stream() {return m_stream;}
00244     bool eof() {return m_stream ? feof(m_stream) : false;}
00245 };
00246 
00247 FileReader::FileReader()
00248 {
00249     m_stream = NULL;
00250 }
00251 
00252 FileReader::FileReader(FILE *stream) : m_stream(stream)
00253 {
00254 }
00255 
00256 FileReader::~FileReader()
00257 {
00258     fclose(m_stream);
00259     m_stream = NULL;
00260 }
00261 
00262 //
00263 // fill buffer until we have count bytes of valid
00264 // data.
00265 //
00266 // need to detect error and eof and return appropriate values.
00267 //
00268 PeekaheadBuffer::ReturnCode FileReader::readahead(ssize_t count)
00269 {
00270     uint8_t buffer[4096];
00271     while(dataRemaining() < count) {
00272         int bytesRead = fread(buffer, 1, sizeof(buffer), m_stream);
00273         if(bytesRead==0) {
00274             if(ferror(m_stream)) {
00275                 return reSync;
00276             }
00277             // ain't getting no more data...
00278             return endOfFile;
00279         }
00280 #if 0
00281         fprintf(stderr, "\n\n");
00282         int possible = -1;
00283         for(int i=0;i<bytesRead;i+=16) {
00284             fprintf(stderr,"%08x: ", i);
00285             for(int j=0;j<16;j++) {
00286                 if(buffer[i+j]==31 && buffer[i+j+1]==139) {
00287                     possible = i+j;
00288                 }
00289                 fprintf(stderr,"%02x ", buffer[i+j]);
00290             }
00291             fprintf(stderr, "\n");
00292         }
00293         if(possible>0) {
00294             fprintf(stderr,"possible signature at %08x\n", possible);
00295         }
00296 #endif
00297         insert(end(), &buffer[0], &buffer[0] + bytesRead);
00298     }
00299     return ok;
00300 }
00301 
00302 class BGZFReader : public PeekaheadBuffer {
00303     FileReader  m_fileReader;
00304 
00305 public:
00306 
00307     BGZFReader(FILE *stream) : m_fileReader(stream) {;}
00308 
00309     PeekaheadBuffer::ReturnCode readahead(ssize_t count);
00310 
00311     //
00312     // This will be reading data, and needs to return EOF, etc
00313     //
00314     ReturnCode sync() {
00315         // my internal data is now bad, so we'll scan ahead seeing
00316         // if we can find a good header
00317         clear();
00318         PeekaheadBuffer::ReturnCode rc;
00319         while((rc = m_fileReader.readahead(sizeof(BGZFHeader)))==ok ) {
00320             BGZFHeader *header;
00321             if(rc==endOfFile) return rc;
00322             // a rc==reSync is ok provided readahead still ensures that header is present
00323             void *src = &(*(m_fileReader.begin())) + m_fileReader.startPosition();
00324             header = (BGZFHeader *) src;
00325             if(header->sane()) {
00326                 if(debug) std::cerr << "BGZFReader::sync returning reSync\n";
00327                 return reSync;  // tell caller they need to sync up
00328             }
00329             // consume a byte, then see if we're at a valid block header
00330             uint8_t throwAwayBuffer;
00331             rc = m_fileReader.read(&throwAwayBuffer, 1);
00332         }
00333         return rc;
00334     }
00335     FILE *stream() {return m_fileReader.stream();}
00336 
00337     bool eof() {return dataRemaining()==0 && m_fileReader.eof();}
00338 
00339 };
00340 
00341 PeekaheadBuffer::ReturnCode BGZFReader::readahead(ssize_t count)
00342 {
00343     BGZFHeader header;
00344     // size of inflateBuffer can be determined from ISIZE, I think
00345     uint8_t inflateBuffer[64*1024];
00346     uint8_t gzipBuffer[64*1024+1];
00347 
00348     while(dataRemaining() < count) {
00349         static int loopCount = 0;
00350 
00351         if(debug) std::cerr << "BGZFReader::readahead loopcount = " << loopCount++ << "\n";
00352 
00353         // here we actually read data:
00354         //  read what should be the header
00355         //  verify the header
00356         //  read the remainder of the block
00357         //  check the CRC validity or perhaps just call unzip
00358         //
00359         // XXX the sizeof(header) is wrong:
00360         PeekaheadBuffer::ReturnCode rc = m_fileReader.read((uint8_t *) (&header), sizeof(header));
00361 
00362         if(rc == endOfFile) {
00363             return endOfFile;
00364         }
00365 
00366         // if we have a bad header, start looking forward for a good one,
00367         if(!header.sane()) {
00368             // sync does not consume the next good header, it simply syncs()
00369             // the data stream to the next believed good BGZF header:
00370             if(debug) std::cerr << "BGZFReader::readahead found corrupt BGZF header - now calling sync()\n";
00371             rc = sync();
00372             //
00373             // even though we can now decompress, we need to tell the caller
00374             // what is up before they call for more data (caller needs to
00375             // sync its own record stream):
00376             return rc;
00377         }
00378 
00379         // Read the remainder of the block.
00380         // BSIZE is size of the entire block - 1, so compensate.
00381         rc = m_fileReader.read((uint8_t *) &gzipBuffer, header.BSIZE() + 1 - sizeof(header));
00382 
00383         if(rc == reSync) {
00384             if(debug) std::cerr << "BGZFReader::readahead got incomplete BGZF read - now calling sync()\n";
00385             sync();
00386             return reSync;
00387         }
00388 
00389         //
00390         // we read a header, but our attempt to read more data ended early,
00391         // so best to just return EOF
00392         //
00393         if(rc == endOfFile) {
00394             return rc;
00395         }
00396 
00397         PeekaheadBuffer::ReturnCode bgzf_rc = ok;
00398         // zs.opaque is set when zalloc is NULL
00399         //
00400         // NB: zlib inflateInit2() has valgrind errors
00401         // in versions <1.2.4 - those can be ignored.
00402         //
00403         z_stream zs;
00404         zs.zalloc = NULL;
00405         zs.zfree = NULL;
00406         zs.next_in = gzipBuffer;
00407         zs.avail_in = header.BSIZE() - 16;  // XXX need to check docs for inflate
00408         zs.next_out = inflateBuffer;
00409         zs.avail_out = sizeof(inflateBuffer);
00410 
00411         // -15 --> raw inflate - don't look for gzip or zlib header
00412         // This can be optimized - inflateInit2 does a malloc of
00413         // approximately 10K (sizeof(inflate_state))
00414         if(inflateInit2(&zs, -15) != Z_OK) {
00415             bgzf_rc = reSync;
00416             if(debug) std::cerr << "BGZFReader::readahead - inflateInit2 failed (out of memory?)\n";
00417             // XXX fatal?
00418         }
00419         if(bgzf_rc==ok && inflate(&zs, Z_FINISH) != Z_STREAM_END) {
00420             bgzf_rc = reSync;
00421             if(debug) std::cerr << "BGZFReader::readahead - inflate failed (bad data), calling sync()\n";
00422         }
00423 
00424         if(bgzf_rc == ok) {
00425             if(inflateEnd(&zs) == Z_OK) {
00426                 // do something with zs.total_out
00427                 if(debug) std::cout << "hey, got data!  zs.total_out == " << zs.total_out << "\n";
00428 
00429                 // append the newly decompressed data
00430                 insert(end(), &inflateBuffer[0], &inflateBuffer[0] + zs.total_out);
00431             } else {
00432                 // seems exceptionall unlikely, but check this error case too
00433                 bgzf_rc = reSync;
00434                 if(debug) std::cerr << "BGZFReader::readahead - inflateInit2 failed (out of memory?)\n";
00435                 // XXX fatal?
00436             }
00437         }
00438 
00439         if(bgzf_rc != ok) {
00440             inflateEnd(&zs);
00441             sync();
00442             return bgzf_rc;
00443         }
00444 
00445         // may need to get more data - loop back till all is complete
00446     }
00447 
00448     return ok;
00449 
00450 }
00451 
00452 
#if 0
//
// Disabled manual smoke test: decompress BGZF data arriving on stdin.
//
void testBGZFBuffer()
{
    BGZFReader reader(stdin);

    std::cout << "size = " << reader.dataRemaining() << "\n";

    //
    // readahead should decompress a BGZF block and populate the
    // buffer with the unzipped data.  It may instead return a
    // PeekaheadBuffer::ReturnCode of reSync if the BGZF data turned
    // out to be interrupted by bad CRC checks.
    //
    const BGZFReader::ReturnCode status = reader.readahead(64);
    std::cout << "rc = " << status << " - expect ok (1)\n";
    std::cout << "size (expect 64) = " << reader.size() << "\n";
}


int main(int argc, const char **argv)
{
    testBGZFBuffer();
}
#endif
00480 
00481 
00482 
00483 int BgzfFileTypeRecovery::close()
00484 {
00485     if(bgzfReader) delete bgzfReader;
00486     bgzfReader = NULL;
00487     return true;
00488 }
00489 
00490 
00491 BgzfFileTypeRecovery::BgzfFileTypeRecovery(const char * filename, const char * mode)
00492 {
00493     if(tolower(mode[0])=='r') {
00494         FILE *f = fopen(filename,"r");
00495         bgzfReader = new BGZFReader(f);
00496     } else {
00497         // die for now
00498         if(debug) std::cerr << "Unable to open " << filename << " in mode " << mode << ".\n";
00499         close();
00500     }
00501 }
00502 
00503 //
00504 // Why is this ever called?
00505 //
00506 bool BgzfFileTypeRecovery::operator == (void * rhs)
00507 {
00508     throw std::logic_error("BgzfFileTypeRecovery::operator == is dangerous - do not use");
00509     return false;
00510 }
00511 
00512 bool BgzfFileTypeRecovery::operator != (void * rhs)
00513 {
00514     throw std::logic_error("BgzfFileTypeRecovery::operator != is dangerous - do not use");
00515     return false;
00516 }
00517 
00518 int BgzfFileTypeRecovery::eof()
00519 {
00520     return bgzfReader->eof();
00521 }
00522 
00523 unsigned int BgzfFileTypeRecovery::write(const void * buffer, unsigned int size)
00524 {
00525     // currently unsupported
00526     return 0;
00527 }
00528 
00529 int BgzfFileTypeRecovery::read(void * buffer, unsigned int size)
00530 {
00531 
00532     if(bgzfReader == NULL) {
00533         return 0;
00534     }
00535 
00536     PeekaheadBuffer::ReturnCode rc = bgzfReader->read((uint8_t *) buffer, size);
00537    //     endOfFile = -1,
00538 //        reSync = 0,
00539 //        ok = 1
00540     switch(rc) {
00541         case PeekaheadBuffer::endOfFile:
00542             // set a flag?
00543             return 0;
00544         case PeekaheadBuffer::reSync:
00545             // we could encode more info in the exception message here:
00546             if(debug) std::cerr << "throwing BGZF sync exception\n";
00547             throw std::runtime_error("BGZF stream resync");
00548         case PeekaheadBuffer::ok:
00549             //
00550             // in bgzfReader, we always are ensured we
00551             // get the full amount of the read, otherwise
00552             // an error is thrown.
00553             //
00554             return size;
00555     }
00556     // NOTREACHED
00557     return 0;
00558 }
00559 
00560 int64_t BgzfFileTypeRecovery::tell()
00561 {
00562     // currently unsupported
00563     return 0;
00564 }
00565 
00566 bool BgzfFileTypeRecovery::seek(int64_t offset, int origin)
00567 {
00568     // currently unsupported
00569     return 0;
00570 }
00571 
00572 
00573 bool BgzfFileTypeRecovery::attemptRecoverySync(bool (*checkSignature)(void *data) , int length)
00574 {
00575     //
00576     // creep along a byte at a time, checking for signature.
00577     //
00578     // possibly slow.  should only need to scan ahead < 64K bytes
00579     // or so, however, so should recover in "reasonable" time.
00580     //
00581     while( bgzfReader->readahead(length) == PeekaheadBuffer::ok) {
00582         char ch;
00583         void *src = &(*(bgzfReader->begin())) + bgzfReader->startPosition();
00584 
00585         //
00586         // readahead ensures we have 'length' bytes of
00587         // data to check that is valid in the buffer.
00588         //
00589         if((*checkSignature)(src)) return true;
00590         PeekaheadBuffer::ReturnCode rc  = bgzfReader->read((uint8_t *) &ch, 1);
00591         if(rc!=PeekaheadBuffer::ok) return false;
00592         // we consumed a byte, so go back to top of loop,
00593         // resume filling buffer (if need be) and re-check
00594     }
00595 
00596 
00597     return false;
00598 }
00599 
00600 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends