BgzfFileTypeRecovery.cpp

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include "BgzfFileTypeRecovery.h"
00019 
00020 #include <stdio.h>
00021 #include <stdint.h>
00022 
00023 #include <sys/types.h>
00024 #include <sys/stat.h>
00025 #include <fcntl.h>
00026 #include <stdlib.h>
00027 #include <string.h>
00028 #include <unistd.h>
00029 #include <zlib.h>
00030 
00031 #include <fstream>
00032 #include <iostream>
00033 #include <stdexcept>
00034 #include <vector>
00035 
00036 #pragma pack(push)
00037 #pragma pack(1)
00038 
00039 #define debug false
00040 
//
// Fixed portion of a gzip member header, restricted to the exact field
// values that sane() accepts (FLG == 4, XLEN == 6, OS == 255, ...).
//
// The object is filled by copying raw file bytes over it, so the member
// order must not change and no virtual functions may be added.
// NOTE(review): the multi-byte comparisons (m_XLEN, m_MTIME) are done on the
// raw in-memory value, which presumably assumes a little endian host - confirm.
//
class GzipHeader {
private:
    uint8_t m_ID1;
    uint8_t m_ID2;
    uint8_t m_CM;
    uint8_t m_FLG;
    uint32_t m_MTIME;
    uint8_t m_XFL;
    uint8_t m_OS;
    uint16_t m_XLEN;
public:
    // Zero every field so that a header which was never filled in is
    // deterministically rejected by sane() instead of reading garbage.
    GzipHeader()
        : m_ID1(0), m_ID2(0), m_CM(0), m_FLG(0),
          m_MTIME(0), m_XFL(0), m_OS(0), m_XLEN(0) {;}

    // Load the one and only set of values that sane() accepts.
    void defaults() {
        m_ID1 = 31;
        m_ID2 = 139;
        m_CM = 8;
        m_FLG = 4;
        m_MTIME = 0;
        m_XFL = 0;
        m_OS = 255;
        m_XLEN = 6;
    }
    uint8_t ID1() const {return m_ID1;}
    uint8_t ID2() const {return m_ID2;}
    uint8_t CM() const {return m_CM;}
    uint8_t FLG() const {return m_FLG;}
    uint32_t MTIME() const {return m_MTIME;}
    uint8_t XFL() const {return m_XFL;}
    uint8_t OS() const {return m_OS;}
    uint16_t XLEN() const {return m_XLEN;}

    // true when every field holds the value written by defaults()
    bool sane() const {
        return (m_ID1==31 && m_ID2==139 && m_CM==8 && m_FLG==4 && m_MTIME==0 && m_XFL == 0 && m_OS == 255 && m_XLEN==6);
    }
};

//
// GzipHeader followed by the 'BC' extra subfield that carries BSIZE.
//
// BSIZE is the size of the entire block minus one.
//
class BGZFHeader : public GzipHeader {
private:
    uint8_t m_SI1;
    uint8_t m_SI2;
    uint16_t m_SLEN;    // little endian
    uint16_t m_BSIZE;   // little endian

    // every gzip member ends with CRC32 + ISIZE, four bytes each
    enum { gzipTrailerSize = 8 };
public:
    // The defaults describe a valid subfield with an (invalid) BSIZE of 0;
    // the gzip portion is zeroed by the base class until defaults() is called.
    BGZFHeader(
            uint8_t si1 = 'B',
            uint8_t si2 = 'C',
            uint16_t slen = 2,
            uint16_t bsize = 0
    ) : m_SI1(si1), m_SI2(si2), m_SLEN(slen), m_BSIZE(bsize) {;}
    uint8_t SI1() const {return m_SI1;}
    uint8_t SI2() const {return m_SI2;}
    uint16_t SLEN() const {return m_SLEN;}
    uint16_t BSIZE() const {return m_BSIZE;}

    // true when the gzip part is sane, the subfield is 'B','C' with length 2,
    // and the block is large enough to hold this header, at least one byte
    // of compressed data, and the gzip trailer.
    bool sane() const {
        return GzipHeader::sane() &&
            (m_SI1=='B' && m_SI2=='C' && m_SLEN==2 &&
             (m_BSIZE + 1u) > (sizeof(BGZFHeader) + gzipTrailerSize));
    }
};
00099 
00100 #pragma pack(pop)
00101 
//
// PeekaheadBuffer allows non-destructive peekahead and resyncing
// after read errors when the underlying stream has signatures in the
// data that allow it.
//
// In addition, it has a peek() capability to allow
// the caller to look ahead in the stream to see
// a certain number of bytes before actually consuming them.
//
// The intent is that this class behave as something of a poor
// man's FIFO - with the cost of buffer movement when data is removed.
//
// This is far from ideal, but we basically are moving data around
// when allowing arbitrary peekahead regardless.
//
// The basis for the design is the fact that most read calls to
// various streams at best allow a single character to be peeked
// at, and secondly, do not allow for recovery after an underlying
// framing error occurs.
//
// That is, getchar()/putchar()/ungetc() support a single byte
// peek.  This may be fine for simple parsing applications, but here
// we need to look at blocks up to 64K or more in size to search
// for signatures while re-synchronizing on the underlying stream.
//
// Layout: the inherited vector holds all buffered bytes; bytes before
// m_startPosition have already been consumed, bytes from m_startPosition
// to end() are fresh.
//
class PeekaheadBuffer : public std::vector<uint8_t> {

protected:
    ssize_t m_startPosition;  // start of fresh data

public:
    enum ReturnCode {
        endOfFile = -1,
        reSync = 0,
        ok = 1
    };

    // index into the vector of the first unconsumed byte
    ssize_t startPosition() {return m_startPosition;}

private:
    //
    // when remaining data is 1/8 the size of the full
    // buffer, shift it back down to the start.
    //
    // for use by read(), which will consume data from the buffer.
    //
    void shiftData() {
        if(dataRemaining() < (ssize_t) (std::vector<uint8_t>::size() / 8) ) {
            erase(begin(), begin() + m_startPosition);
            m_startPosition = 0;
        }
    }
    // called when read reports an error for some
    // reason - the base implementation just discards everything buffered.
    virtual ReturnCode sync();
public:
    PeekaheadBuffer();
    virtual ~PeekaheadBuffer();

    // return the amount of unused data:
    ssize_t dataRemaining();

    //
    // overload size() to throw an exception - too confusing otherwise
//    size_t size() {abort();}

    //
    // just populate data in buffer from stream - not generic
    //
    // XXX note that it simply ensures that count bytes of data
    // are actually loaded into the buffer - if that amount of
    // data (or more) is present, this call is a NOP.
    //
    virtual ReturnCode readahead(ssize_t count) = 0;

    // read is generic.
    // remove data from our buffer - call non-generic readahead to populate data.
    //
    // Returns ok once count bytes have been copied into buffer, reSync when
    // readahead() reported reSync, and endOfFile for any other readahead()
    // result.  Nothing is copied or consumed unless ok is returned.
    ReturnCode read(uint8_t *buffer, ssize_t count) {
        const ReturnCode rc = readahead(count);

        if(rc == reSync) {
            // peek puked - CRC error, other errors, see if we can sync forwards
            return reSync;
        }
        if(rc != ok) {
            // failed to get needed data - premature EOF, I guess
            return endOfFile;
        }

        // A zero length read may find the vector empty, in which case
        // *begin() must not be evaluated - so only touch it when copying.
        if(count > 0) {
            const uint8_t *src = &(*begin()) + m_startPosition;

            memcpy(buffer, src, count);

            m_startPosition += count;  // consume data

            // recover space if wasting too much:
            shiftData();
        }

        return ok;
    }

};

PeekaheadBuffer::PeekaheadBuffer() : m_startPosition(0)
{
}

PeekaheadBuffer::~PeekaheadBuffer()
{
}

// Throw away all buffered data.  The read position has to be rewound along
// with the vector, otherwise dataRemaining() turns negative and the next
// read() skips that many bytes of fresh data.
PeekaheadBuffer::ReturnCode PeekaheadBuffer::sync() {
    clear();
    m_startPosition = 0;
    return ok;
}

// number of buffered bytes that have not been consumed yet
ssize_t PeekaheadBuffer::dataRemaining()
{
    return std::vector<uint8_t>::size() - m_startPosition;
}
00224 
00225 
00226 // peekahead buffered file reader class
00227 class FileReader : public PeekaheadBuffer {
00228     FILE    *m_stream;
00229 public:
00230     FileReader();
00231     ~FileReader();
00232     FileReader(FILE *stream);
00233     PeekaheadBuffer::ReturnCode readahead(ssize_t count);
00234     FILE *stream() {return m_stream;}
00235     bool eof() {return m_stream ? feof(m_stream) : false;}
00236 };
00237 
00238 FileReader::FileReader()
00239 {
00240     m_stream = NULL;
00241 }
00242 
00243 FileReader::FileReader(FILE *stream) : m_stream(stream)
00244 {
00245 }
00246 
00247 FileReader::~FileReader()
00248 {
00249     fclose(m_stream);
00250     m_stream = NULL;
00251 }
00252 
00253 //
00254 // fill buffer until we have count bytes of valid
00255 // data.
00256 //
00257 // need to detect error and eof and return appropriate values.
00258 //
00259 PeekaheadBuffer::ReturnCode FileReader::readahead(ssize_t count)
00260 {
00261     uint8_t buffer[4096];
00262     while(dataRemaining() < count) {
00263         int bytesRead = fread(buffer, 1, sizeof(buffer), m_stream);
00264         if(bytesRead==0) {
00265             if(ferror(m_stream)) {
00266                 return reSync;
00267             }
00268             // ain't getting no more data...
00269             return endOfFile;
00270         }
00271 #if 0
00272         fprintf(stderr, "\n\n");
00273         int possible = -1;
00274         for(int i=0;i<bytesRead;i+=16) {
00275             fprintf(stderr,"%08x: ", i);
00276             for(int j=0;j<16;j++) {
00277                 if(buffer[i+j]==31 && buffer[i+j+1]==139) {
00278                     possible = i+j;
00279                 }
00280                 fprintf(stderr,"%02x ", buffer[i+j]);
00281             }
00282             fprintf(stderr, "\n");
00283         }
00284         if(possible>0) {
00285             fprintf(stderr,"possible signature at %08x\n", possible);
00286         }
00287 #endif
00288         insert(end(), &buffer[0], &buffer[0] + bytesRead);
00289     }
00290     return ok;
00291 }
00292 
00293 class BGZFReader : public PeekaheadBuffer {
00294     FileReader  m_fileReader;
00295 
00296 public:
00297 
00298     BGZFReader(FILE *stream) : m_fileReader(stream) {;}
00299 
00300     PeekaheadBuffer::ReturnCode readahead(ssize_t count);
00301 
00302     //
00303     // This will be reading data, and needs to return EOF, etc
00304     //
00305     ReturnCode sync() {
00306         // my internal data is now bad, so we'll scan ahead seeing
00307         // if we can find a good header
00308         clear();
00309         PeekaheadBuffer::ReturnCode rc;
00310         while((rc = m_fileReader.readahead(sizeof(BGZFHeader)))==ok ) {
00311             BGZFHeader *header;
00312             if(rc==endOfFile) return rc;
00313             // a rc==reSync is ok provided readahead still ensures that header is present
00314             void *src = &(*(m_fileReader.begin())) + m_fileReader.startPosition();
00315             header = (BGZFHeader *) src;
00316             if(header->sane()) {
00317                 if(debug) std::cerr << "BGZFReader::sync returning reSync\n";
00318                 return reSync;  // tell caller they need to sync up
00319             }
00320             // consume a byte, then see if we're at a valid block header
00321             uint8_t throwAwayBuffer;
00322             rc = m_fileReader.read(&throwAwayBuffer, 1);
00323         }
00324         return rc;
00325     }
00326     FILE *stream() {return m_fileReader.stream();}
00327 
00328     bool eof() {return dataRemaining()==0 && m_fileReader.eof();}
00329 
00330 };
00331 
00332 PeekaheadBuffer::ReturnCode BGZFReader::readahead(ssize_t count)
00333 {
00334     BGZFHeader header;
00335     // size of inflateBuffer can be determined from ISIZE, I think
00336     uint8_t inflateBuffer[64*1024];
00337     uint8_t gzipBuffer[64*1024+1];
00338 
00339     while(dataRemaining() < count) {
00340         static int loopCount = 0;
00341 
00342         if(debug) std::cerr << "BGZFReader::readahead loopcount = " << loopCount++ << "\n";
00343 
00344         // here we actually read data:
00345         //  read what should be the header
00346         //  verify the header
00347         //  read the remainder of the block
00348         //  check the CRC validity or perhaps just call unzip
00349         //
00350         // XXX the sizeof(header) is wrong:
00351         PeekaheadBuffer::ReturnCode rc = m_fileReader.read((uint8_t *) (&header), sizeof(header));
00352 
00353         if(rc == endOfFile) {
00354             return endOfFile;
00355         }
00356 
00357         // if we have a bad header, start looking forward for a good one,
00358         if(!header.sane()) {
00359             // sync does not consume the next good header, it simply syncs()
00360             // the data stream to the next believed good BGZF header:
00361             if(debug) std::cerr << "BGZFReader::readahead found corrupt BGZF header - now calling sync()\n";
00362             rc = sync();
00363             //
00364             // even though we can now decompress, we need to tell the caller
00365             // what is up before they call for more data (caller needs to
00366             // sync its own record stream):
00367             return rc;
00368         }
00369 
00370         // Read the remainder of the block.
00371         // BSIZE is size of the entire block - 1, so compensate.
00372         rc = m_fileReader.read((uint8_t *) &gzipBuffer, header.BSIZE() + 1 - sizeof(header));
00373 
00374         if(rc == reSync) {
00375             if(debug) std::cerr << "BGZFReader::readahead got incomplete BGZF read - now calling sync()\n";
00376             sync();
00377             return reSync;
00378         }
00379 
00380         //
00381         // we read a header, but our attempt to read more data ended early,
00382         // so best to just return EOF
00383         //
00384         if(rc == endOfFile) {
00385             return rc;
00386         }
00387 
00388         PeekaheadBuffer::ReturnCode bgzf_rc = ok;
00389         // zs.opaque is set when zalloc is NULL
00390         //
00391         // NB: zlib inflateInit2() has valgrind errors
00392         // in versions <1.2.4 - those can be ignored.
00393         //
00394         z_stream zs;
00395         zs.zalloc = NULL;
00396         zs.zfree = NULL;
00397         zs.next_in = gzipBuffer;
00398         zs.avail_in = header.BSIZE() - 16;  // XXX need to check docs for inflate
00399         zs.next_out = inflateBuffer;
00400         zs.avail_out = sizeof(inflateBuffer);
00401 
00402         // -15 --> raw inflate - don't look for gzip or zlib header
00403         // This can be optimized - inflateInit2 does a malloc of
00404         // approximately 10K (sizeof(inflate_state))
00405         if(inflateInit2(&zs, -15) != Z_OK) {
00406             bgzf_rc = reSync;
00407             if(debug) std::cerr << "BGZFReader::readahead - inflateInit2 failed (out of memory?)\n";
00408             // XXX fatal?
00409         }
00410         if(bgzf_rc==ok && inflate(&zs, Z_FINISH) != Z_STREAM_END) {
00411             bgzf_rc = reSync;
00412             if(debug) std::cerr << "BGZFReader::readahead - inflate failed (bad data), calling sync()\n";
00413         }
00414 
00415         if(bgzf_rc == ok) {
00416             if(inflateEnd(&zs) == Z_OK) {
00417                 // do something with zs.total_out
00418                 if(debug) std::cout << "hey, got data!  zs.total_out == " << zs.total_out << "\n";
00419 
00420                 // append the newly decompressed data
00421                 insert(end(), &inflateBuffer[0], &inflateBuffer[0] + zs.total_out);
00422             } else {
00423                 // seems exceptionall unlikely, but check this error case too
00424                 bgzf_rc = reSync;
00425                 if(debug) std::cerr << "BGZFReader::readahead - inflateInit2 failed (out of memory?)\n";
00426                 // XXX fatal?
00427             }
00428         }
00429 
00430         if(bgzf_rc != ok) {
00431             inflateEnd(&zs);
00432             sync();
00433             return bgzf_rc;
00434         }
00435 
00436         // may need to get more data - loop back till all is complete
00437     }
00438 
00439     return ok;
00440 
00441 }
00442 
00443 
00444 #if 0
00445 void testBGZFBuffer()
00446 {
00447     BGZFReader b(stdin);
00448     std::vector<uint8_t>::iterator position;
00449     BGZFReader::ReturnCode rc;
00450 
00451     std::cout << "size = " << b.dataRemaining() << "\n";
00452 
00453     //
00454     // this should:
00455     //  decompress a BGZF block, populating the buffer with
00456     //  unzipped data, possibly returning a BGZFBuffer::ReturnCode of 
00457     //  resync if it turns out the BGZF data was interrupted by bad
00458     //  CRC checks.
00459     //
00460     rc = b.readahead(64);
00461     std::cout << "rc = " << rc << " - expect ok (1)\n";
00462     std::cout << "size (expect 64) = " << b.size() << "\n";
00463 }
00464 
00465 
00466 int main(int argc, const char **argv)
00467 {
00468     testBGZFBuffer();
00469 }
00470 #endif
00471 
00472 
00473 
00474 int BgzfFileTypeRecovery::close()
00475 {
00476     if(bgzfReader) delete bgzfReader;
00477     bgzfReader = NULL;
00478     return true;
00479 }
00480 
00481 
00482 BgzfFileTypeRecovery::BgzfFileTypeRecovery(const char * filename, const char * mode)
00483 {
00484     if(tolower(mode[0])=='r') {
00485         FILE *f = fopen(filename,"r");
00486         bgzfReader = new BGZFReader(f);
00487     } else {
00488         // die for now
00489         if(debug) std::cerr << "Unable to open " << filename << " in mode " << mode << ".\n";
00490         close();
00491     }
00492 }
00493 
00494 //
00495 // Why is this ever called?
00496 //
00497 bool BgzfFileTypeRecovery::operator == (void * rhs)
00498 {
00499     throw std::logic_error("BgzfFileTypeRecovery::operator == is dangerous - do not use");
00500     return false;
00501 }
00502 
00503 bool BgzfFileTypeRecovery::operator != (void * rhs)
00504 {
00505     throw std::logic_error("BgzfFileTypeRecovery::operator != is dangerous - do not use");
00506     return false;
00507 }
00508 
00509 int BgzfFileTypeRecovery::eof()
00510 {
00511     return bgzfReader->eof();
00512 }
00513 
00514 unsigned int BgzfFileTypeRecovery::write(const void * buffer, unsigned int size)
00515 {
00516     // currently unsupported
00517     return 0;
00518 }
00519 
00520 int BgzfFileTypeRecovery::read(void * buffer, unsigned int size)
00521 {
00522 
00523     if(bgzfReader == NULL) {
00524         return 0;
00525     }
00526 
00527     PeekaheadBuffer::ReturnCode rc = bgzfReader->read((uint8_t *) buffer, size);
00528    //     endOfFile = -1,
00529 //        reSync = 0,
00530 //        ok = 1
00531     switch(rc) {
00532         case PeekaheadBuffer::endOfFile:
00533             // set a flag?
00534             return 0;
00535         case PeekaheadBuffer::reSync:
00536             // we could encode more info in the exception message here:
00537             if(debug) std::cerr << "throwing BGZF sync exception\n";
00538             throw std::runtime_error("BGZF stream resync");
00539         case PeekaheadBuffer::ok:
00540             //
00541             // in bgzfReader, we always are ensured we
00542             // get the full amount of the read, otherwise
00543             // an error is thrown.
00544             //
00545             return size;
00546     }
00547     // NOTREACHED
00548     return 0;
00549 }
00550 
00551 int64_t BgzfFileTypeRecovery::tell()
00552 {
00553     // currently unsupported
00554     return 0;
00555 }
00556 
00557 bool BgzfFileTypeRecovery::seek(int64_t offset, int origin)
00558 {
00559     // currently unsupported
00560     return 0;
00561 }
00562 
00563 
00564 bool BgzfFileTypeRecovery::attemptRecoverySync(bool (*checkSignature)(void *data) , int length)
00565 {
00566     //
00567     // creep along a byte at a time, checking for signature.
00568     //
00569     // possibly slow.  should only need to scan ahead < 64K bytes
00570     // or so, however, so should recover in "reasonable" time.
00571     //
00572     while( bgzfReader->readahead(length) == PeekaheadBuffer::ok) {
00573         char ch;
00574         void *src = &(*(bgzfReader->begin())) + bgzfReader->startPosition();
00575 
00576         //
00577         // readahead ensures we have 'length' bytes of
00578         // data to check that is valid in the buffer.
00579         //
00580         if((*checkSignature)(src)) return true;
00581         PeekaheadBuffer::ReturnCode rc  = bgzfReader->read((uint8_t *) &ch, 1);
00582         if(rc!=PeekaheadBuffer::ok) return false;
00583         // we consumed a byte, so go back to top of loop,
00584         // resume filling buffer (if need be) and re-check
00585     }
00586 
00587 
00588     return false;
00589 }
00590