BgzfFileTypeRecovery.cpp

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifdef __ZLIB_AVAILABLE__
00019 
00020 #include "BgzfFileTypeRecovery.h"
00021 
00022 #include <stdio.h>
00023 #include <stdint.h>
00024 
00025 #include <sys/types.h>
00026 #include <sys/stat.h>
00027 #include <fcntl.h>
00028 #include <stdlib.h>
00029 #include <string.h>
00030 #include <unistd.h>
00031 #include <zlib.h>
00032 
00033 #include <fstream>
00034 #include <iostream>
00035 #include <stdexcept>
00036 #include <vector>
00037 
00038 #pragma pack(push,1)
00039 
00040 #define debug false
00041 
//
// The fixed, 12 byte leading portion of a gzip member header, laid out
// field for field in the order the bytes appear in the file.
//
// Objects of this type are filled by copying raw file bytes over them,
// so the member order and widths must not change, and the class must be
// declared inside a '#pragma pack(1)' region.
//
// NOTE(review): the multi-byte fields (MTIME, XLEN) are compared in host
// byte order, while gzip stores them little endian - sane() presumably
// only succeeds on little endian hosts; confirm before porting.
//
class RecoveryGzipHeader {
private:
    uint8_t m_ID1;      // gzip magic, first byte (31)
    uint8_t m_ID2;      // gzip magic, second byte (139)
    uint8_t m_CM;       // compression method (8 == deflate)
    uint8_t m_FLG;      // flags (4 == an extra field follows)
    uint32_t m_MTIME;   // modification time
    uint8_t m_XFL;      // extra flags
    uint8_t m_OS;       // originating OS (255 == unknown)
    uint16_t m_XLEN;    // length in bytes of the extra field
public:
    // Start with every field zeroed; such a header is never sane().
    RecoveryGzipHeader() :
        m_ID1(0),
        m_ID2(0),
        m_CM(0),
        m_FLG(0),
        m_MTIME(0),
        m_XFL(0),
        m_OS(0),
        m_XLEN(0)
    {}

    // Load exactly the field values that sane() accepts.
    void defaults() {
        m_ID1 = 31;
        m_ID2 = 139;
        m_CM = 8;
        m_FLG = 4;
        m_MTIME = 0;
        m_XFL = 0;
        m_OS = 255;
        m_XLEN = 6;
    }

    // Read-only accessors; const so they work on const headers too.
    uint8_t ID1() const {return m_ID1;}
    uint8_t ID2() const {return m_ID2;}
    uint8_t CM() const {return m_CM;}
    uint8_t FLG() const {return m_FLG;}
    uint32_t MTIME() const {return m_MTIME;}
    uint8_t XFL() const {return m_XFL;}
    uint8_t OS() const {return m_OS;}
    uint16_t XLEN() const {return m_XLEN;}

    // True only when every field holds the value set by defaults().
    // Used as a signature test when hunting for a block header.
    bool sane() const {
        return (m_ID1==31 && m_ID2==139 && m_CM==8 && m_FLG==4 && m_MTIME==0 && m_XFL == 0 && m_OS == 255 && m_XLEN==6);
    }
};
00086 
00087 class BGZFHeader : public RecoveryGzipHeader {
00088 private:
00089     uint8_t m_SI1;
00090     uint8_t m_SI2;
00091     uint16_t m_SLEN;    // little endian
00092     uint16_t m_BSIZE;   // little endian
00093 public:
00094     BGZFHeader(
00095             uint8_t m_SI1 = 'B',
00096             uint8_t m_SI2 = 'C',
00097             uint16_t m_SLEN = 2,
00098             uint16_t m_BSIZE = 0
00099     ) : m_SI1(m_SI1), m_SI2(m_SI2), m_SLEN(m_SLEN), m_BSIZE(m_BSIZE) {;}
00100     uint8_t SI1() {return m_SI1;}
00101     uint8_t SI2() {return m_SI2;}
00102     uint16_t SLEN() {return m_SLEN;}
00103     uint16_t BSIZE() {return m_BSIZE;}
00104     bool sane() {
00105         return RecoveryGzipHeader::sane() && 
00106             (m_SI1=='B' && m_SI2=='C' && m_SLEN==2 && m_BSIZE > sizeof(BGZFHeader));
00107     }
00108 };
00109 
00110 #pragma pack(pop)
00111 
//
// PeekaheadBuffer allows non-destructive peekahead and resyncing
// after read errors when the underlying stream has signatures in the
// data that allow it.
//
// The buffer is a std::vector<uint8_t>; the bytes from index
// startPosition() up to the end of the vector are the data that has
// been loaded but not yet consumed by read().  Callers can inspect that
// region before deciding to consume it.
//
// The intent is that this class behave as something of a poor
// man's FIFO - with the cost of buffer movement when data is removed.
//
// This is far from ideal, but we basically are moving data around
// when allowing arbitrary peekahead regardless.
//
// The basis for the design is the fact that most read calls to
// various streams at best allow a single character to be peeked
// at, and secondly, do not allow for recovery after an underlying
// framing error occurs.
//
// That is, getchar()/putchar()/ungetc() support a single byte
// peek.  This may be fine for simple parsing applications, but here
// we need to look at blocks up to 64K or more in size to search
// for signatures while re-synchronizing on the underlying stream.
//
class PeekaheadBuffer : public std::vector<uint8_t> {

protected:
    ssize_t m_startPosition;  // start of fresh data

public:
    enum ReturnCode {
        endOfFile = -1,     // no more data can be obtained
        reSync = 0,         // an error occurred; caller must resynchronize
        ok = 1              // the requested data is available
    };

    // index into the vector of the first unconsumed byte
    ssize_t startPosition() {return m_startPosition;}

private:
    //
    // when remaining data is less than 1/8 the size of the full
    // buffer, drop the consumed bytes so the fresh data starts at
    // index 0 again.
    //
    // for use by read(), which will consume data from the buffer.
    //
    void shiftData() {
        if(dataRemaining() < (ssize_t) (std::vector<uint8_t>::size() / 8) ) {
            erase(begin(), begin() + m_startPosition);
            m_startPosition = 0;
        }
    }
    // called when read reports an error for some
    // reason - discards everything that is buffered
    virtual ReturnCode sync();
public:
    PeekaheadBuffer();
    virtual ~PeekaheadBuffer();

    // return the amount of unused data:
    ssize_t dataRemaining();

    //
    // overload size() to throw an exception - too confusing otherwise
//    size_t size() {abort();}

    //
    // just populate data in buffer from stream - not generic
    //
    // XXX note that it simply ensures that count bytes of data
    // are actually loaded into the buffer - if that amount of
    // data (or more) is present, this call is a NOP.
    //
    virtual ReturnCode readahead(ssize_t count) = 0;

    //
    // read is generic.
    // remove data from our buffer - call non-generic readahead to populate data.
    //
    // buffer: destination, must have room for count bytes
    // count:  number of bytes to consume
    //
    // returns ok when count bytes were copied out, reSync when
    // readahead() reported an error the caller should resynchronize
    // after, and endOfFile for any other readahead() result.
    //
    ReturnCode read(uint8_t *buffer, ssize_t count) {
        const ReturnCode rc = readahead(count);

        if(rc == reSync) {
            // peek puked - CRC error, other errors, see if we can sync forwards
            return reSync;
        }
        if(rc != ok) {
            // failed to get needed data - premature EOF, I guess
            return endOfFile;
        }

        // A non-positive count consumes nothing.  Checking it here also
        // avoids taking the address of an element of an empty vector and
        // handing memcpy a negative length.
        if(count > 0) {
            memcpy(buffer, &(*this)[0] + m_startPosition, count);

            m_startPosition += count;  // consume data

            // recover space if wasting too much:
            shiftData();
        }

        return ok;
    }

};

PeekaheadBuffer::PeekaheadBuffer() : m_startPosition(0)
{
}

PeekaheadBuffer::~PeekaheadBuffer()
{
}

//
// Throw away all buffered data.  The start position has to be reset
// together with the vector contents, otherwise dataRemaining() turns
// negative and the first bytes loaded afterwards would be skipped.
//
PeekaheadBuffer::ReturnCode PeekaheadBuffer::sync() {
    clear();
    m_startPosition = 0;
    return ok;
}

// number of bytes loaded into the vector that read() has not consumed yet
ssize_t PeekaheadBuffer::dataRemaining()
{
    return (ssize_t) std::vector<uint8_t>::size() - m_startPosition;
}
00234 
00235 
00236 // peekahead buffered file reader class
00237 class FileReader : public PeekaheadBuffer {
00238     FILE    *m_stream;
00239 public:
00240     FileReader();
00241     ~FileReader();
00242     FileReader(FILE *stream);
00243     PeekaheadBuffer::ReturnCode readahead(ssize_t count);
00244     FILE *stream() {return m_stream;}
00245     bool eof() {return m_stream ? feof(m_stream) : false;}
00246 };
00247 
00248 FileReader::FileReader()
00249 {
00250     m_stream = NULL;
00251 }
00252 
00253 FileReader::FileReader(FILE *stream) : m_stream(stream)
00254 {
00255 }
00256 
00257 FileReader::~FileReader()
00258 {
00259     fclose(m_stream);
00260     m_stream = NULL;
00261 }
00262 
00263 //
00264 // fill buffer until we have count bytes of valid
00265 // data.
00266 //
00267 // need to detect error and eof and return appropriate values.
00268 //
00269 PeekaheadBuffer::ReturnCode FileReader::readahead(ssize_t count)
00270 {
00271     uint8_t buffer[4096];
00272     while(dataRemaining() < count) {
00273         int bytesRead = fread(buffer, 1, sizeof(buffer), m_stream);
00274         if(bytesRead==0) {
00275             if(ferror(m_stream)) {
00276                 return reSync;
00277             }
00278             // ain't getting no more data...
00279             return endOfFile;
00280         }
00281 #if 0
00282         fprintf(stderr, "\n\n");
00283         int possible = -1;
00284         for(int i=0;i<bytesRead;i+=16) {
00285             fprintf(stderr,"%08x: ", i);
00286             for(int j=0;j<16;j++) {
00287                 if(buffer[i+j]==31 && buffer[i+j+1]==139) {
00288                     possible = i+j;
00289                 }
00290                 fprintf(stderr,"%02x ", buffer[i+j]);
00291             }
00292             fprintf(stderr, "\n");
00293         }
00294         if(possible>0) {
00295             fprintf(stderr,"possible signature at %08x\n", possible);
00296         }
00297 #endif
00298         insert(end(), &buffer[0], &buffer[0] + bytesRead);
00299     }
00300     return ok;
00301 }
00302 
00303 class BGZFReader : public PeekaheadBuffer {
00304     FileReader  m_fileReader;
00305 
00306 public:
00307 
00308     BGZFReader(FILE *stream) : m_fileReader(stream) {;}
00309 
00310     PeekaheadBuffer::ReturnCode readahead(ssize_t count);
00311 
00312     //
00313     // This will be reading data, and needs to return EOF, etc
00314     //
00315     ReturnCode sync() {
00316         // my internal data is now bad, so we'll scan ahead seeing
00317         // if we can find a good header
00318         clear();
00319         PeekaheadBuffer::ReturnCode rc;
00320         while((rc = m_fileReader.readahead(sizeof(BGZFHeader)))==ok ) {
00321             BGZFHeader *header;
00322             if(rc==endOfFile) return rc;
00323             // a rc==reSync is ok provided readahead still ensures that header is present
00324             void *src = &(*(m_fileReader.begin())) + m_fileReader.startPosition();
00325             header = (BGZFHeader *) src;
00326             if(header->sane()) {
00327                 if(debug) std::cerr << "BGZFReader::sync returning reSync\n";
00328                 return reSync;  // tell caller they need to sync up
00329             }
00330             // consume a byte, then see if we're at a valid block header
00331             uint8_t throwAwayBuffer;
00332             rc = m_fileReader.read(&throwAwayBuffer, 1);
00333         }
00334         return rc;
00335     }
00336     FILE *stream() {return m_fileReader.stream();}
00337 
00338     bool eof() {return dataRemaining()==0 && m_fileReader.eof();}
00339 
00340 };
00341 
00342 PeekaheadBuffer::ReturnCode BGZFReader::readahead(ssize_t count)
00343 {
00344     BGZFHeader header;
00345     // size of inflateBuffer can be determined from ISIZE, I think
00346     uint8_t inflateBuffer[64*1024];
00347     uint8_t gzipBuffer[64*1024+1];
00348 
00349     while(dataRemaining() < count) {
00350         static int loopCount = 0;
00351 
00352         if(debug) std::cerr << "BGZFReader::readahead loopcount = " << loopCount++ << "\n";
00353 
00354         // here we actually read data:
00355         //  read what should be the header
00356         //  verify the header
00357         //  read the remainder of the block
00358         //  check the CRC validity or perhaps just call unzip
00359         //
00360         // XXX the sizeof(header) is wrong:
00361         PeekaheadBuffer::ReturnCode rc = m_fileReader.read((uint8_t *) (&header), sizeof(header));
00362 
00363         if(rc == endOfFile) {
00364             return endOfFile;
00365         }
00366 
00367         // if we have a bad header, start looking forward for a good one,
00368         if(!header.sane()) {
00369             // sync does not consume the next good header, it simply syncs()
00370             // the data stream to the next believed good BGZF header:
00371             if(debug) std::cerr << "BGZFReader::readahead found corrupt BGZF header - now calling sync()\n";
00372             rc = sync();
00373             //
00374             // even though we can now decompress, we need to tell the caller
00375             // what is up before they call for more data (caller needs to
00376             // sync its own record stream):
00377             return rc;
00378         }
00379 
00380         // Read the remainder of the block.
00381         // BSIZE is size of the entire block - 1, so compensate.
00382         rc = m_fileReader.read((uint8_t *) &gzipBuffer, header.BSIZE() + 1 - sizeof(header));
00383 
00384         if(rc == reSync) {
00385             if(debug) std::cerr << "BGZFReader::readahead got incomplete BGZF read - now calling sync()\n";
00386             sync();
00387             return reSync;
00388         }
00389 
00390         //
00391         // we read a header, but our attempt to read more data ended early,
00392         // so best to just return EOF
00393         //
00394         if(rc == endOfFile) {
00395             return rc;
00396         }
00397 
00398         PeekaheadBuffer::ReturnCode bgzf_rc = ok;
00399         // zs.opaque is set when zalloc is NULL
00400         //
00401         // NB: zlib inflateInit2() has valgrind errors
00402         // in versions <1.2.4 - those can be ignored.
00403         //
00404         z_stream zs;
00405         zs.zalloc = NULL;
00406         zs.zfree = NULL;
00407         zs.next_in = gzipBuffer;
00408         zs.avail_in = header.BSIZE() - 16;  // XXX need to check docs for inflate
00409         zs.next_out = inflateBuffer;
00410         zs.avail_out = sizeof(inflateBuffer);
00411 
00412         // -15 --> raw inflate - don't look for gzip or zlib header
00413         // This can be optimized - inflateInit2 does a malloc of
00414         // approximately 10K (sizeof(inflate_state))
00415         if(inflateInit2(&zs, -15) != Z_OK) {
00416             bgzf_rc = reSync;
00417             if(debug) std::cerr << "BGZFReader::readahead - inflateInit2 failed (out of memory?)\n";
00418             // XXX fatal?
00419         }
00420         if(bgzf_rc==ok && inflate(&zs, Z_FINISH) != Z_STREAM_END) {
00421             bgzf_rc = reSync;
00422             if(debug) std::cerr << "BGZFReader::readahead - inflate failed (bad data), calling sync()\n";
00423         }
00424 
00425         if(bgzf_rc == ok) {
00426             if(inflateEnd(&zs) == Z_OK) {
00427                 // do something with zs.total_out
00428                 if(debug) std::cout << "hey, got data!  zs.total_out == " << zs.total_out << "\n";
00429 
00430                 // append the newly decompressed data
00431                 insert(end(), &inflateBuffer[0], &inflateBuffer[0] + zs.total_out);
00432             } else {
00433                 // seems exceptionall unlikely, but check this error case too
00434                 bgzf_rc = reSync;
00435                 if(debug) std::cerr << "BGZFReader::readahead - inflateInit2 failed (out of memory?)\n";
00436                 // XXX fatal?
00437             }
00438         }
00439 
00440         if(bgzf_rc != ok) {
00441             inflateEnd(&zs);
00442             sync();
00443             return bgzf_rc;
00444         }
00445 
00446         // may need to get more data - loop back till all is complete
00447     }
00448 
00449     return ok;
00450 
00451 }
00452 
00453 
#if 0
//
// Manual smoke test, compiled out.
//
// Reads BGZF data from stdin and asks for 64 bytes of decompressed
// data.  readahead() should inflate a BGZF block into the buffer, and
// may instead report reSync if the BGZF data turns out to be damaged.
//
void testBGZFBuffer()
{
    BGZFReader reader(stdin);

    std::cout << "size = " << reader.dataRemaining() << "\n";

    const BGZFReader::ReturnCode status = reader.readahead(64);

    std::cout << "rc = " << status << " - expect ok (1)\n";
    std::cout << "size (expect 64) = " << reader.size() << "\n";
}


int main(int argc, const char **argv)
{
    testBGZFBuffer();
    return 0;
}
#endif
00481 
00482 
00483 
00484 int BgzfFileTypeRecovery::close()
00485 {
00486     if(bgzfReader) delete bgzfReader;
00487     bgzfReader = NULL;
00488     return true;
00489 }
00490 
00491 
00492 BgzfFileTypeRecovery::BgzfFileTypeRecovery(const char * filename, const char * mode)
00493 {
00494     if(tolower(mode[0])=='r') {
00495         FILE *f = fopen(filename,"r");
00496         bgzfReader = new BGZFReader(f);
00497     } else {
00498         // die for now
00499         if(debug) std::cerr << "Unable to open " << filename << " in mode " << mode << ".\n";
00500         close();
00501     }
00502 }
00503 
00504 //
00505 // Why is this ever called?
00506 //
00507 bool BgzfFileTypeRecovery::operator == (void * rhs)
00508 {
00509     throw std::logic_error("BgzfFileTypeRecovery::operator == is dangerous - do not use");
00510     return false;
00511 }
00512 
00513 bool BgzfFileTypeRecovery::operator != (void * rhs)
00514 {
00515     throw std::logic_error("BgzfFileTypeRecovery::operator != is dangerous - do not use");
00516     return false;
00517 }
00518 
00519 int BgzfFileTypeRecovery::eof()
00520 {
00521     return bgzfReader->eof();
00522 }
00523 
00524 unsigned int BgzfFileTypeRecovery::write(const void * buffer, unsigned int size)
00525 {
00526     // currently unsupported
00527     return 0;
00528 }
00529 
00530 int BgzfFileTypeRecovery::read(void * buffer, unsigned int size)
00531 {
00532 
00533     if(bgzfReader == NULL) {
00534         return 0;
00535     }
00536 
00537     PeekaheadBuffer::ReturnCode rc = bgzfReader->read((uint8_t *) buffer, size);
00538    //     endOfFile = -1,
00539 //        reSync = 0,
00540 //        ok = 1
00541     switch(rc) {
00542         case PeekaheadBuffer::endOfFile:
00543             // set a flag?
00544             return 0;
00545         case PeekaheadBuffer::reSync:
00546             // we could encode more info in the exception message here:
00547             if(debug) std::cerr << "throwing BGZF sync exception\n";
00548             throw std::runtime_error("BGZF stream resync");
00549         case PeekaheadBuffer::ok:
00550             //
00551             // in bgzfReader, we always are ensured we
00552             // get the full amount of the read, otherwise
00553             // an error is thrown.
00554             //
00555             return size;
00556     }
00557     // NOTREACHED
00558     return 0;
00559 }
00560 
00561 int64_t BgzfFileTypeRecovery::tell()
00562 {
00563     // currently unsupported
00564     return 0;
00565 }
00566 
00567 bool BgzfFileTypeRecovery::seek(int64_t offset, int origin)
00568 {
00569     // currently unsupported
00570     return 0;
00571 }
00572 
00573 
00574 bool BgzfFileTypeRecovery::attemptRecoverySync(bool (*checkSignature)(void *data) , int length)
00575 {
00576     //
00577     // creep along a byte at a time, checking for signature.
00578     //
00579     // possibly slow.  should only need to scan ahead < 64K bytes
00580     // or so, however, so should recover in "reasonable" time.
00581     //
00582     while( bgzfReader->readahead(length) == PeekaheadBuffer::ok) {
00583         char ch;
00584         void *src = &(*(bgzfReader->begin())) + bgzfReader->startPosition();
00585 
00586         //
00587         // readahead ensures we have 'length' bytes of
00588         // data to check that is valid in the buffer.
00589         //
00590         if((*checkSignature)(src)) return true;
00591         PeekaheadBuffer::ReturnCode rc  = bgzfReader->read((uint8_t *) &ch, 1);
00592         if(rc!=PeekaheadBuffer::ok) return false;
00593         // we consumed a byte, so go back to top of loop,
00594         // resume filling buffer (if need be) and re-check
00595     }
00596 
00597 
00598     return false;
00599 }
00600 
00601 #endif