00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #ifdef __ZLIB_AVAILABLE__
00019
00020 #include "BgzfFileTypeRecovery.h"
00021
00022 #include <stdio.h>
00023 #include <stdint.h>
00024
00025 #include <sys/types.h>
00026 #include <sys/stat.h>
00027 #include <fcntl.h>
00028 #include <stdlib.h>
00029 #include <string.h>
00030 #include <unistd.h>
00031 #include <zlib.h>
00032
00033 #include <fstream>
00034 #include <iostream>
00035 #include <stdexcept>
00036 #include <vector>
00037
00038 #pragma pack(push,1)
00039
00040 #define debug false
00041
// The fixed leading fields of a gzip member header (field names follow
// RFC 1952: ID1, ID2, CM, FLG, MTIME, XFL, OS, XLEN).
//
// Instances are overlaid directly on raw file bytes elsewhere in this file,
// so the member order and types below must not change.
// NOTE(review): the overlay presumably relies on the surrounding
// `#pragma pack(push,1)` and on a little-endian host, since the multi-byte
// fields are compared without byte swapping -- confirm before porting.
class RecoveryGzipHeader {
private:
    uint8_t  m_ID1;
    uint8_t  m_ID2;
    uint8_t  m_CM;
    uint8_t  m_FLG;
    uint32_t m_MTIME;
    uint8_t  m_XFL;
    uint8_t  m_OS;
    uint16_t m_XLEN;

public:
    // Builds an all-zero header; such a header is never sane().
    RecoveryGzipHeader() :
        m_ID1(0),
        m_ID2(0),
        m_CM(0),
        m_FLG(0),
        m_MTIME(0),
        m_XFL(0),
        m_OS(0),
        m_XLEN(0)
    {}

    // Fills in exactly the field values that sane() accepts.
    void defaults() {
        m_ID1   = 31;
        m_ID2   = 139;
        m_CM    = 8;
        m_FLG   = 4;
        m_MTIME = 0;
        m_XFL   = 0;
        m_OS    = 255;
        m_XLEN  = 6;
    }

    // Read-only accessors; const-qualified so they also work on const headers.
    uint8_t  ID1()   const { return m_ID1; }
    uint8_t  ID2()   const { return m_ID2; }
    uint8_t  CM()    const { return m_CM; }
    uint8_t  FLG()   const { return m_FLG; }
    uint32_t MTIME() const { return m_MTIME; }
    uint8_t  XFL()   const { return m_XFL; }
    uint8_t  OS()    const { return m_OS; }
    uint16_t XLEN()  const { return m_XLEN; }

    // Returns true only when every field holds the value written by
    // defaults(); any other value is treated as "not a header we recognise".
    bool sane() const {
        return (m_ID1 == 31 && m_ID2 == 139 && m_CM == 8 && m_FLG == 4 &&
                m_MTIME == 0 && m_XFL == 0 && m_OS == 255 && m_XLEN == 6);
    }
};
00086
00087 class BGZFHeader : public RecoveryGzipHeader {
00088 private:
00089 uint8_t m_SI1;
00090 uint8_t m_SI2;
00091 uint16_t m_SLEN;
00092 uint16_t m_BSIZE;
00093 public:
00094 BGZFHeader(
00095 uint8_t m_SI1 = 'B',
00096 uint8_t m_SI2 = 'C',
00097 uint16_t m_SLEN = 2,
00098 uint16_t m_BSIZE = 0
00099 ) : m_SI1(m_SI1), m_SI2(m_SI2), m_SLEN(m_SLEN), m_BSIZE(m_BSIZE) {;}
00100 uint8_t SI1() {return m_SI1;}
00101 uint8_t SI2() {return m_SI2;}
00102 uint16_t SLEN() {return m_SLEN;}
00103 uint16_t BSIZE() {return m_BSIZE;}
00104 bool sane() {
00105 return RecoveryGzipHeader::sane() &&
00106 (m_SI1=='B' && m_SI2=='C' && m_SLEN==2 && m_BSIZE > sizeof(BGZFHeader));
00107 }
00108 };
00109
00110 #pragma pack(pop)
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
// A byte vector with a read cursor.
//
// Bytes before m_startPosition have been consumed by read(); bytes from
// m_startPosition to the end are buffered but not yet consumed. Subclasses
// supply readahead() to append more data to the end of the vector.
class PeekaheadBuffer : public std::vector<uint8_t> {

protected:
    // Offset into the vector of the next unread byte.
    ssize_t m_startPosition;

public:
    enum ReturnCode {
        endOfFile = -1,
        reSync = 0,
        ok = 1
    };

    ssize_t startPosition() {return m_startPosition;}

private:
    // Drops the consumed prefix once the unread portion has shrunk below one
    // eighth of the vector, so the vector does not grow without bound.
    void shiftData() {
        if(dataRemaining() < (ssize_t) (std::vector<uint8_t>::size() / 8) ) {
            erase(begin(), begin() + m_startPosition);
            m_startPosition = 0;
        }
    }

    virtual ReturnCode sync();

public:
    PeekaheadBuffer();
    virtual ~PeekaheadBuffer();

    // Number of buffered bytes that have not been consumed yet.
    ssize_t dataRemaining();

    // Ensure at least `count` unread bytes are buffered (implemented by
    // subclasses). read() below copies data only when this returns ok.
    virtual ReturnCode readahead(ssize_t count) = 0;

    // Copies `count` bytes from the cursor into `buffer` and advances the
    // cursor.
    //
    // Returns ok when the bytes were copied, reSync when readahead() reported
    // reSync, and endOfFile for any other readahead() result. Nothing is
    // copied and the cursor is not moved unless ok is returned.
    ReturnCode read(uint8_t *buffer, ssize_t count) {
        const ReturnCode rc = readahead(count);

        if(rc != ok) {
            return (rc == reSync) ? reSync : endOfFile;
        }

        // A non-positive count has nothing to copy. Skipping the copy also
        // avoids dereferencing begin() on an empty vector and avoids handing
        // a negative length to memcpy.
        if(count > 0) {
            const uint8_t *src = &(*begin()) + m_startPosition;

            memcpy(buffer, src, count);
            m_startPosition += count;

            // Reclaim the consumed prefix when it dominates the vector.
            shiftData();
        }

        return ok;
    }

};
00216
00217 PeekaheadBuffer::PeekaheadBuffer() : m_startPosition(0)
00218 {
00219 }
00220
00221 PeekaheadBuffer::~PeekaheadBuffer()
00222 {
00223 }
00224
00225 PeekaheadBuffer::ReturnCode PeekaheadBuffer::sync() {
00226 clear();
00227 return ok;
00228 }
00229
00230 ssize_t PeekaheadBuffer::dataRemaining()
00231 {
00232 return std::vector<uint8_t>::size() - m_startPosition;
00233 }
00234
00235
00236
00237 class FileReader : public PeekaheadBuffer {
00238 FILE *m_stream;
00239 public:
00240 FileReader();
00241 ~FileReader();
00242 FileReader(FILE *stream);
00243 PeekaheadBuffer::ReturnCode readahead(ssize_t count);
00244 FILE *stream() {return m_stream;}
00245 bool eof() {return m_stream ? feof(m_stream) : false;}
00246 };
00247
00248 FileReader::FileReader()
00249 {
00250 m_stream = NULL;
00251 }
00252
00253 FileReader::FileReader(FILE *stream) : m_stream(stream)
00254 {
00255 }
00256
00257 FileReader::~FileReader()
00258 {
00259 fclose(m_stream);
00260 m_stream = NULL;
00261 }
00262
00263
00264
00265
00266
00267
00268
00269 PeekaheadBuffer::ReturnCode FileReader::readahead(ssize_t count)
00270 {
00271 uint8_t buffer[4096];
00272 while(dataRemaining() < count) {
00273 int bytesRead = fread(buffer, 1, sizeof(buffer), m_stream);
00274 if(bytesRead==0) {
00275 if(ferror(m_stream)) {
00276 return reSync;
00277 }
00278
00279 return endOfFile;
00280 }
00281 #if 0
00282 fprintf(stderr, "\n\n");
00283 int possible = -1;
00284 for(int i=0;i<bytesRead;i+=16) {
00285 fprintf(stderr,"%08x: ", i);
00286 for(int j=0;j<16;j++) {
00287 if(buffer[i+j]==31 && buffer[i+j+1]==139) {
00288 possible = i+j;
00289 }
00290 fprintf(stderr,"%02x ", buffer[i+j]);
00291 }
00292 fprintf(stderr, "\n");
00293 }
00294 if(possible>0) {
00295 fprintf(stderr,"possible signature at %08x\n", possible);
00296 }
00297 #endif
00298 insert(end(), &buffer[0], &buffer[0] + bytesRead);
00299 }
00300 return ok;
00301 }
00302
00303 class BGZFReader : public PeekaheadBuffer {
00304 FileReader m_fileReader;
00305
00306 public:
00307
00308 BGZFReader(FILE *stream) : m_fileReader(stream) {;}
00309
00310 PeekaheadBuffer::ReturnCode readahead(ssize_t count);
00311
00312
00313
00314
00315 ReturnCode sync() {
00316
00317
00318 clear();
00319 PeekaheadBuffer::ReturnCode rc;
00320 while((rc = m_fileReader.readahead(sizeof(BGZFHeader)))==ok ) {
00321 BGZFHeader *header;
00322 if(rc==endOfFile) return rc;
00323
00324 void *src = &(*(m_fileReader.begin())) + m_fileReader.startPosition();
00325 header = (BGZFHeader *) src;
00326 if(header->sane()) {
00327 if(debug) std::cerr << "BGZFReader::sync returning reSync\n";
00328 return reSync;
00329 }
00330
00331 uint8_t throwAwayBuffer;
00332 rc = m_fileReader.read(&throwAwayBuffer, 1);
00333 }
00334 return rc;
00335 }
00336 FILE *stream() {return m_fileReader.stream();}
00337
00338 bool eof() {return dataRemaining()==0 && m_fileReader.eof();}
00339
00340 };
00341
00342 PeekaheadBuffer::ReturnCode BGZFReader::readahead(ssize_t count)
00343 {
00344 BGZFHeader header;
00345
00346 uint8_t inflateBuffer[64*1024];
00347 uint8_t gzipBuffer[64*1024+1];
00348
00349 while(dataRemaining() < count) {
00350 static int loopCount = 0;
00351
00352 if(debug) std::cerr << "BGZFReader::readahead loopcount = " << loopCount++ << "\n";
00353
00354
00355
00356
00357
00358
00359
00360
00361 PeekaheadBuffer::ReturnCode rc = m_fileReader.read((uint8_t *) (&header), sizeof(header));
00362
00363 if(rc == endOfFile) {
00364 return endOfFile;
00365 }
00366
00367
00368 if(!header.sane()) {
00369
00370
00371 if(debug) std::cerr << "BGZFReader::readahead found corrupt BGZF header - now calling sync()\n";
00372 rc = sync();
00373
00374
00375
00376
00377 return rc;
00378 }
00379
00380
00381
00382 rc = m_fileReader.read((uint8_t *) &gzipBuffer, header.BSIZE() + 1 - sizeof(header));
00383
00384 if(rc == reSync) {
00385 if(debug) std::cerr << "BGZFReader::readahead got incomplete BGZF read - now calling sync()\n";
00386 sync();
00387 return reSync;
00388 }
00389
00390
00391
00392
00393
00394 if(rc == endOfFile) {
00395 return rc;
00396 }
00397
00398 PeekaheadBuffer::ReturnCode bgzf_rc = ok;
00399
00400
00401
00402
00403
00404 z_stream zs;
00405 zs.zalloc = NULL;
00406 zs.zfree = NULL;
00407 zs.next_in = gzipBuffer;
00408 zs.avail_in = header.BSIZE() - 16;
00409 zs.next_out = inflateBuffer;
00410 zs.avail_out = sizeof(inflateBuffer);
00411
00412
00413
00414
00415 if(inflateInit2(&zs, -15) != Z_OK) {
00416 bgzf_rc = reSync;
00417 if(debug) std::cerr << "BGZFReader::readahead - inflateInit2 failed (out of memory?)\n";
00418
00419 }
00420 if(bgzf_rc==ok && inflate(&zs, Z_FINISH) != Z_STREAM_END) {
00421 bgzf_rc = reSync;
00422 if(debug) std::cerr << "BGZFReader::readahead - inflate failed (bad data), calling sync()\n";
00423 }
00424
00425 if(bgzf_rc == ok) {
00426 if(inflateEnd(&zs) == Z_OK) {
00427
00428 if(debug) std::cout << "hey, got data! zs.total_out == " << zs.total_out << "\n";
00429
00430
00431 insert(end(), &inflateBuffer[0], &inflateBuffer[0] + zs.total_out);
00432 } else {
00433
00434 bgzf_rc = reSync;
00435 if(debug) std::cerr << "BGZFReader::readahead - inflateInit2 failed (out of memory?)\n";
00436
00437 }
00438 }
00439
00440 if(bgzf_rc != ok) {
00441 inflateEnd(&zs);
00442 sync();
00443 return bgzf_rc;
00444 }
00445
00446
00447 }
00448
00449 return ok;
00450
00451 }
00452
00453
00454 #if 0
00455 void testBGZFBuffer()
00456 {
00457 BGZFReader b(stdin);
00458 std::vector<uint8_t>::iterator position;
00459 BGZFReader::ReturnCode rc;
00460
00461 std::cout << "size = " << b.dataRemaining() << "\n";
00462
00463
00464
00465
00466
00467
00468
00469
00470 rc = b.readahead(64);
00471 std::cout << "rc = " << rc << " - expect ok (1)\n";
00472 std::cout << "size (expect 64) = " << b.size() << "\n";
00473 }
00474
00475
00476 int main(int argc, const char **argv)
00477 {
00478 testBGZFBuffer();
00479 }
00480 #endif
00481
00482
00483
00484 int BgzfFileTypeRecovery::close()
00485 {
00486 if(bgzfReader) delete bgzfReader;
00487 bgzfReader = NULL;
00488 return true;
00489 }
00490
00491
00492 BgzfFileTypeRecovery::BgzfFileTypeRecovery(const char * filename, const char * mode)
00493 {
00494 if(tolower(mode[0])=='r') {
00495 FILE *f = fopen(filename,"r");
00496 bgzfReader = new BGZFReader(f);
00497 } else {
00498
00499 if(debug) std::cerr << "Unable to open " << filename << " in mode " << mode << ".\n";
00500 close();
00501 }
00502 }
00503
00504
00505
00506
00507 bool BgzfFileTypeRecovery::operator == (void * rhs)
00508 {
00509 throw std::logic_error("BgzfFileTypeRecovery::operator == is dangerous - do not use");
00510 return false;
00511 }
00512
00513 bool BgzfFileTypeRecovery::operator != (void * rhs)
00514 {
00515 throw std::logic_error("BgzfFileTypeRecovery::operator != is dangerous - do not use");
00516 return false;
00517 }
00518
00519 int BgzfFileTypeRecovery::eof()
00520 {
00521 return bgzfReader->eof();
00522 }
00523
00524 unsigned int BgzfFileTypeRecovery::write(const void * buffer, unsigned int size)
00525 {
00526
00527 return 0;
00528 }
00529
00530 int BgzfFileTypeRecovery::read(void * buffer, unsigned int size)
00531 {
00532
00533 if(bgzfReader == NULL) {
00534 return 0;
00535 }
00536
00537 PeekaheadBuffer::ReturnCode rc = bgzfReader->read((uint8_t *) buffer, size);
00538
00539
00540
00541 switch(rc) {
00542 case PeekaheadBuffer::endOfFile:
00543
00544 return 0;
00545 case PeekaheadBuffer::reSync:
00546
00547 if(debug) std::cerr << "throwing BGZF sync exception\n";
00548 throw std::runtime_error("BGZF stream resync");
00549 case PeekaheadBuffer::ok:
00550
00551
00552
00553
00554
00555 return size;
00556 }
00557
00558 return 0;
00559 }
00560
00561 int64_t BgzfFileTypeRecovery::tell()
00562 {
00563
00564 return 0;
00565 }
00566
00567 bool BgzfFileTypeRecovery::seek(int64_t offset, int origin)
00568 {
00569
00570 return 0;
00571 }
00572
00573
00574 bool BgzfFileTypeRecovery::attemptRecoverySync(bool (*checkSignature)(void *data) , int length)
00575 {
00576
00577
00578
00579
00580
00581
00582 while( bgzfReader->readahead(length) == PeekaheadBuffer::ok) {
00583 char ch;
00584 void *src = &(*(bgzfReader->begin())) + bgzfReader->startPosition();
00585
00586
00587
00588
00589
00590 if((*checkSignature)(src)) return true;
00591 PeekaheadBuffer::ReturnCode rc = bgzfReader->read((uint8_t *) &ch, 1);
00592 if(rc!=PeekaheadBuffer::ok) return false;
00593
00594
00595 }
00596
00597
00598 return false;
00599 }
00600
00601 #endif