BgzfFileTypeRecovery.cpp
00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #include "BgzfFileTypeRecovery.h"
00019
00020 #include <stdio.h>
00021 #include <stdint.h>
00022
00023 #include <sys/types.h>
00024 #include <sys/stat.h>
00025 #include <fcntl.h>
00026 #include <stdlib.h>
00027 #include <string.h>
00028 #include <unistd.h>
00029 #include <zlib.h>
00030
00031 #include <fstream>
00032 #include <iostream>
00033 #include <stdexcept>
00034 #include <vector>
00035
00036 #pragma pack(push)
00037 #pragma pack(1)
00038
00039 #define debug false
00040
/**
 * Fixed 12-byte gzip member header (field names follow RFC 1952).
 *
 * The object is used as a raw overlay: file bytes are copied straight into
 * it and then validated with sane().  Field order and widths therefore must
 * not change, and the surrounding #pragma pack(1) must stay in effect.
 *
 * Multi-byte fields are compared exactly as loaded (no byte swapping), so
 * sane() only matches the on-disk little-endian layout on little-endian hosts.
 */
class GzipHeader {
private:
    uint8_t  m_ID1;
    uint8_t  m_ID2;
    uint8_t  m_CM;
    uint8_t  m_FLG;
    uint32_t m_MTIME;
    uint8_t  m_XFL;
    uint8_t  m_OS;
    uint16_t m_XLEN;

public:
    /**
     * Zero-fill every field so that a header which was never filled from the
     * file deterministically fails sane() instead of reading indeterminate
     * memory.
     */
    GzipHeader()
        : m_ID1(0), m_ID2(0), m_CM(0), m_FLG(0),
          m_MTIME(0), m_XFL(0), m_OS(0), m_XLEN(0) {}

    /** Load the exact field values that sane() accepts. */
    void defaults() {
        m_ID1 = 31;     // gzip magic, first byte
        m_ID2 = 139;    // gzip magic, second byte
        m_CM = 8;       // compression method: deflate
        m_FLG = 4;      // FEXTRA: an extra field follows
        m_MTIME = 0;
        m_XFL = 0;
        m_OS = 255;     // OS: unknown
        m_XLEN = 6;     // length of the extra field that follows
    }

    uint8_t  ID1()   const { return m_ID1; }
    uint8_t  ID2()   const { return m_ID2; }
    uint8_t  CM()    const { return m_CM; }
    uint8_t  FLG()   const { return m_FLG; }
    uint32_t MTIME() const { return m_MTIME; }
    uint8_t  XFL()   const { return m_XFL; }
    uint8_t  OS()    const { return m_OS; }
    uint16_t XLEN()  const { return m_XLEN; }

    /** @return true when every field holds the value written by defaults(). */
    bool sane() const {
        return (m_ID1==31 && m_ID2==139 && m_CM==8 && m_FLG==4 && m_MTIME==0 && m_XFL == 0 && m_OS == 255 && m_XLEN==6);
    }
};
00076
00077 class BGZFHeader : public GzipHeader {
00078 private:
00079 uint8_t m_SI1;
00080 uint8_t m_SI2;
00081 uint16_t m_SLEN;
00082 uint16_t m_BSIZE;
00083 public:
00084 BGZFHeader(
00085 uint8_t m_SI1 = 'B',
00086 uint8_t m_SI2 = 'C',
00087 uint16_t m_SLEN = 2,
00088 uint16_t m_BSIZE = 0
00089 ) : m_SI1(m_SI1), m_SI2(m_SI2), m_SLEN(m_SLEN), m_BSIZE(m_BSIZE) {;}
00090 uint8_t SI1() {return m_SI1;}
00091 uint8_t SI2() {return m_SI2;}
00092 uint16_t SLEN() {return m_SLEN;}
00093 uint16_t BSIZE() {return m_BSIZE;}
00094 bool sane() {
00095 return GzipHeader::sane() &&
00096 (m_SI1=='B' && m_SI2=='C' && m_SLEN==2 && m_BSIZE > sizeof(BGZFHeader));
00097 }
00098 };
00099
00100 #pragma pack(pop)
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
/**
 * Byte buffer with a read cursor, used as the base of the stream readers.
 *
 * The vector holds bytes that have been fetched; m_startPosition is the index
 * of the first byte not yet handed out by read().  Derived classes implement
 * readahead() to make at least `count` unread bytes available, which lets
 * callers inspect upcoming data in place before deciding to consume it.
 */
class PeekaheadBuffer : public std::vector<uint8_t> {

protected:
    /// Index (within the vector) of the first unread byte.
    ssize_t m_startPosition;

public:
    enum ReturnCode {
        endOfFile = -1,
        reSync = 0,
        ok = 1
    };

    /// @return index of the first unread byte.
    ssize_t startPosition() const { return m_startPosition; }

private:
    /**
     * Drop the already-consumed prefix once fewer than 1/8 of the stored
     * bytes are still unread.  Compacting lazily keeps the cost of erasing
     * from the front of the vector amortised.
     */
    void shiftData() {
        if (dataRemaining() < static_cast<ssize_t>(std::vector<uint8_t>::size() / 8)) {
            erase(begin(), begin() + m_startPosition);
            m_startPosition = 0;
        }
    }

    /// Discard all buffered data; overridden by readers that can resynchronise.
    virtual ReturnCode sync();

public:
    PeekaheadBuffer();
    virtual ~PeekaheadBuffer();

    /// @return number of buffered bytes not yet consumed by read().
    ssize_t dataRemaining() const;

    /**
     * Ensure at least `count` unread bytes are buffered.
     * @return ok when they are available, endOfFile when the source ran out,
     *         reSync when the source hit an error.
     */
    virtual ReturnCode readahead(ssize_t count) = 0;

    /**
     * Copy exactly `count` bytes into `buffer` and advance the cursor.
     *
     * Nothing is copied unless readahead() reports ok, so a short tail at end
     * of file is not returned.
     *
     * @param buffer destination with room for `count` bytes.
     * @param count  number of bytes wanted; zero or negative copies nothing.
     * @return ok, reSync, or endOfFile (any other readahead() result maps to
     *         endOfFile).
     */
    ReturnCode read(uint8_t *buffer, ssize_t count) {
        const ReturnCode rc = readahead(count);

        if (rc == reSync) {
            return reSync;
        }
        if (rc != ok) {
            return endOfFile;
        }
        if (count <= 0) {
            // Nothing to copy.  Returning here avoids taking the address of
            // an element in a possibly empty vector and avoids handing a
            // negative length to memcpy.
            return ok;
        }

        memcpy(buffer, &(*this)[0] + m_startPosition, count);
        m_startPosition += count;

        // Reclaim consumed space if enough has accumulated.
        shiftData();
        return ok;
    }

};

PeekaheadBuffer::PeekaheadBuffer() : m_startPosition(0)
{
}

PeekaheadBuffer::~PeekaheadBuffer()
{
}

PeekaheadBuffer::ReturnCode PeekaheadBuffer::sync() {
    clear();
    // The cursor indexes into the vector, so it has to be rewound together
    // with the data; otherwise dataRemaining() would go negative.
    m_startPosition = 0;
    return ok;
}

ssize_t PeekaheadBuffer::dataRemaining() const
{
    return static_cast<ssize_t>(std::vector<uint8_t>::size()) - m_startPosition;
}
00224
00225
00226
00227 class FileReader : public PeekaheadBuffer {
00228 FILE *m_stream;
00229 public:
00230 FileReader();
00231 ~FileReader();
00232 FileReader(FILE *stream);
00233 PeekaheadBuffer::ReturnCode readahead(ssize_t count);
00234 FILE *stream() {return m_stream;}
00235 bool eof() {return m_stream ? feof(m_stream) : false;}
00236 };
00237
00238 FileReader::FileReader()
00239 {
00240 m_stream = NULL;
00241 }
00242
00243 FileReader::FileReader(FILE *stream) : m_stream(stream)
00244 {
00245 }
00246
00247 FileReader::~FileReader()
00248 {
00249 fclose(m_stream);
00250 m_stream = NULL;
00251 }
00252
00253
00254
00255
00256
00257
00258
00259 PeekaheadBuffer::ReturnCode FileReader::readahead(ssize_t count)
00260 {
00261 uint8_t buffer[4096];
00262 while(dataRemaining() < count) {
00263 int bytesRead = fread(buffer, 1, sizeof(buffer), m_stream);
00264 if(bytesRead==0) {
00265 if(ferror(m_stream)) {
00266 return reSync;
00267 }
00268
00269 return endOfFile;
00270 }
00271 #if 0
00272 fprintf(stderr, "\n\n");
00273 int possible = -1;
00274 for(int i=0;i<bytesRead;i+=16) {
00275 fprintf(stderr,"%08x: ", i);
00276 for(int j=0;j<16;j++) {
00277 if(buffer[i+j]==31 && buffer[i+j+1]==139) {
00278 possible = i+j;
00279 }
00280 fprintf(stderr,"%02x ", buffer[i+j]);
00281 }
00282 fprintf(stderr, "\n");
00283 }
00284 if(possible>0) {
00285 fprintf(stderr,"possible signature at %08x\n", possible);
00286 }
00287 #endif
00288 insert(end(), &buffer[0], &buffer[0] + bytesRead);
00289 }
00290 return ok;
00291 }
00292
00293 class BGZFReader : public PeekaheadBuffer {
00294 FileReader m_fileReader;
00295
00296 public:
00297
00298 BGZFReader(FILE *stream) : m_fileReader(stream) {;}
00299
00300 PeekaheadBuffer::ReturnCode readahead(ssize_t count);
00301
00302
00303
00304
00305 ReturnCode sync() {
00306
00307
00308 clear();
00309 PeekaheadBuffer::ReturnCode rc;
00310 while((rc = m_fileReader.readahead(sizeof(BGZFHeader)))==ok ) {
00311 BGZFHeader *header;
00312 if(rc==endOfFile) return rc;
00313
00314 void *src = &(*(m_fileReader.begin())) + m_fileReader.startPosition();
00315 header = (BGZFHeader *) src;
00316 if(header->sane()) {
00317 if(debug) std::cerr << "BGZFReader::sync returning reSync\n";
00318 return reSync;
00319 }
00320
00321 uint8_t throwAwayBuffer;
00322 rc = m_fileReader.read(&throwAwayBuffer, 1);
00323 }
00324 return rc;
00325 }
00326 FILE *stream() {return m_fileReader.stream();}
00327
00328 bool eof() {return dataRemaining()==0 && m_fileReader.eof();}
00329
00330 };
00331
00332 PeekaheadBuffer::ReturnCode BGZFReader::readahead(ssize_t count)
00333 {
00334 BGZFHeader header;
00335
00336 uint8_t inflateBuffer[64*1024];
00337 uint8_t gzipBuffer[64*1024+1];
00338
00339 while(dataRemaining() < count) {
00340 static int loopCount = 0;
00341
00342 if(debug) std::cerr << "BGZFReader::readahead loopcount = " << loopCount++ << "\n";
00343
00344
00345
00346
00347
00348
00349
00350
00351 PeekaheadBuffer::ReturnCode rc = m_fileReader.read((uint8_t *) (&header), sizeof(header));
00352
00353 if(rc == endOfFile) {
00354 return endOfFile;
00355 }
00356
00357
00358 if(!header.sane()) {
00359
00360
00361 if(debug) std::cerr << "BGZFReader::readahead found corrupt BGZF header - now calling sync()\n";
00362 rc = sync();
00363
00364
00365
00366
00367 return rc;
00368 }
00369
00370
00371
00372 rc = m_fileReader.read((uint8_t *) &gzipBuffer, header.BSIZE() + 1 - sizeof(header));
00373
00374 if(rc == reSync) {
00375 if(debug) std::cerr << "BGZFReader::readahead got incomplete BGZF read - now calling sync()\n";
00376 sync();
00377 return reSync;
00378 }
00379
00380
00381
00382
00383
00384 if(rc == endOfFile) {
00385 return rc;
00386 }
00387
00388 PeekaheadBuffer::ReturnCode bgzf_rc = ok;
00389
00390
00391
00392
00393
00394 z_stream zs;
00395 zs.zalloc = NULL;
00396 zs.zfree = NULL;
00397 zs.next_in = gzipBuffer;
00398 zs.avail_in = header.BSIZE() - 16;
00399 zs.next_out = inflateBuffer;
00400 zs.avail_out = sizeof(inflateBuffer);
00401
00402
00403
00404
00405 if(inflateInit2(&zs, -15) != Z_OK) {
00406 bgzf_rc = reSync;
00407 if(debug) std::cerr << "BGZFReader::readahead - inflateInit2 failed (out of memory?)\n";
00408
00409 }
00410 if(bgzf_rc==ok && inflate(&zs, Z_FINISH) != Z_STREAM_END) {
00411 bgzf_rc = reSync;
00412 if(debug) std::cerr << "BGZFReader::readahead - inflate failed (bad data), calling sync()\n";
00413 }
00414
00415 if(bgzf_rc == ok) {
00416 if(inflateEnd(&zs) == Z_OK) {
00417
00418 if(debug) std::cout << "hey, got data! zs.total_out == " << zs.total_out << "\n";
00419
00420
00421 insert(end(), &inflateBuffer[0], &inflateBuffer[0] + zs.total_out);
00422 } else {
00423
00424 bgzf_rc = reSync;
00425 if(debug) std::cerr << "BGZFReader::readahead - inflateInit2 failed (out of memory?)\n";
00426
00427 }
00428 }
00429
00430 if(bgzf_rc != ok) {
00431 inflateEnd(&zs);
00432 sync();
00433 return bgzf_rc;
00434 }
00435
00436
00437 }
00438
00439 return ok;
00440
00441 }
00442
00443
#if 0
// Disabled manual smoke test: inflates BGZF data from stdin and prints the
// buffer state before and after asking for 64 bytes.
void testBGZFBuffer()
{
    BGZFReader reader(stdin);

    std::cout << "size = " << reader.dataRemaining() << "\n";

    const BGZFReader::ReturnCode status = reader.readahead(64);
    std::cout << "rc = " << status << " - expect ok (1)\n";
    std::cout << "size (expect 64) = " << reader.size() << "\n";
}


int main(int argc, const char **argv)
{
    testBGZFBuffer();
}
#endif
00471
00472
00473
00474 int BgzfFileTypeRecovery::close()
00475 {
00476 if(bgzfReader) delete bgzfReader;
00477 bgzfReader = NULL;
00478 return true;
00479 }
00480
00481
00482 BgzfFileTypeRecovery::BgzfFileTypeRecovery(const char * filename, const char * mode)
00483 {
00484 if(tolower(mode[0])=='r') {
00485 FILE *f = fopen(filename,"r");
00486 bgzfReader = new BGZFReader(f);
00487 } else {
00488
00489 if(debug) std::cerr << "Unable to open " << filename << " in mode " << mode << ".\n";
00490 close();
00491 }
00492 }
00493
00494
00495
00496
00497 bool BgzfFileTypeRecovery::operator == (void * rhs)
00498 {
00499 throw std::logic_error("BgzfFileTypeRecovery::operator == is dangerous - do not use");
00500 return false;
00501 }
00502
00503 bool BgzfFileTypeRecovery::operator != (void * rhs)
00504 {
00505 throw std::logic_error("BgzfFileTypeRecovery::operator != is dangerous - do not use");
00506 return false;
00507 }
00508
00509 int BgzfFileTypeRecovery::eof()
00510 {
00511 return bgzfReader->eof();
00512 }
00513
00514 unsigned int BgzfFileTypeRecovery::write(const void * buffer, unsigned int size)
00515 {
00516
00517 return 0;
00518 }
00519
00520 int BgzfFileTypeRecovery::read(void * buffer, unsigned int size)
00521 {
00522
00523 if(bgzfReader == NULL) {
00524 return 0;
00525 }
00526
00527 PeekaheadBuffer::ReturnCode rc = bgzfReader->read((uint8_t *) buffer, size);
00528
00529
00530
00531 switch(rc) {
00532 case PeekaheadBuffer::endOfFile:
00533
00534 return 0;
00535 case PeekaheadBuffer::reSync:
00536
00537 if(debug) std::cerr << "throwing BGZF sync exception\n";
00538 throw std::runtime_error("BGZF stream resync");
00539 case PeekaheadBuffer::ok:
00540
00541
00542
00543
00544
00545 return size;
00546 }
00547
00548 return 0;
00549 }
00550
00551 int64_t BgzfFileTypeRecovery::tell()
00552 {
00553
00554 return 0;
00555 }
00556
00557 bool BgzfFileTypeRecovery::seek(int64_t offset, int origin)
00558 {
00559
00560 return 0;
00561 }
00562
00563
00564 bool BgzfFileTypeRecovery::attemptRecoverySync(bool (*checkSignature)(void *data) , int length)
00565 {
00566
00567
00568
00569
00570
00571
00572 while( bgzfReader->readahead(length) == PeekaheadBuffer::ok) {
00573 char ch;
00574 void *src = &(*(bgzfReader->begin())) + bgzfReader->startPosition();
00575
00576
00577
00578
00579
00580 if((*checkSignature)(src)) return true;
00581 PeekaheadBuffer::ReturnCode rc = bgzfReader->read((uint8_t *) &ch, 1);
00582 if(rc!=PeekaheadBuffer::ok) return false;
00583
00584
00585 }
00586
00587
00588 return false;
00589 }
00590