libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifdef __ZLIB_AVAILABLE__ 00019 00020 #include "BgzfFileTypeRecovery.h" 00021 00022 #include <stdio.h> 00023 #include <stdint.h> 00024 00025 #include <sys/types.h> 00026 #include <sys/stat.h> 00027 #include <fcntl.h> 00028 #include <stdlib.h> 00029 #include <string.h> 00030 #include <zlib.h> 00031 00032 #include <fstream> 00033 #include <iostream> 00034 #include <stdexcept> 00035 #include <vector> 00036 00037 #pragma pack(push,1) 00038 00039 #define debug false 00040 00041 class RecoveryGzipHeader { 00042 private: 00043 uint8_t m_ID1; 00044 uint8_t m_ID2; 00045 uint8_t m_CM; 00046 uint8_t m_FLG; 00047 uint32_t m_MTIME; 00048 uint8_t m_XFL; 00049 uint8_t m_OS; 00050 uint16_t m_XLEN; 00051 public: 00052 RecoveryGzipHeader() : 00053 m_ID1(0), 00054 m_ID2(0), 00055 m_CM(0), 00056 m_FLG(0), 00057 m_MTIME(0), 00058 m_XFL(0), 00059 m_OS(0), 00060 m_XLEN(0) 00061 {;} 00062 00063 void defaults() { 00064 m_ID1 = 31; 00065 m_ID2 = 139; 00066 m_CM = 8; 00067 m_FLG = 4; 00068 m_MTIME = 0; 00069 m_XFL = 0; 00070 m_OS = 255; 00071 m_XLEN = 6; 00072 } 00073 uint8_t ID1() {return m_ID1;} 00074 uint8_t ID2() {return m_ID2;} 00075 uint8_t CM() {return m_CM;} 00076 uint8_t FLG() {return 
m_FLG;} 00077 uint32_t MTIME() {return m_MTIME;} 00078 uint8_t XFL() {return m_XFL;} 00079 uint8_t OS() {return m_OS;} 00080 uint16_t XLEN() {return m_XLEN;} 00081 bool sane() { 00082 return (m_ID1==31 && m_ID2==139 && m_CM==8 && m_FLG==4 && m_MTIME==0 && m_XFL == 0 && m_OS == 255 && m_XLEN==6); 00083 } 00084 }; 00085 00086 class BGZFHeader : public RecoveryGzipHeader { 00087 private: 00088 uint8_t m_SI1; 00089 uint8_t m_SI2; 00090 uint16_t m_SLEN; // little endian 00091 uint16_t m_BSIZE; // little endian 00092 public: 00093 BGZFHeader( 00094 uint8_t m_SI1 = 'B', 00095 uint8_t m_SI2 = 'C', 00096 uint16_t m_SLEN = 2, 00097 uint16_t m_BSIZE = 0 00098 ) : m_SI1(m_SI1), m_SI2(m_SI2), m_SLEN(m_SLEN), m_BSIZE(m_BSIZE) {;} 00099 uint8_t SI1() {return m_SI1;} 00100 uint8_t SI2() {return m_SI2;} 00101 uint16_t SLEN() {return m_SLEN;} 00102 uint16_t BSIZE() {return m_BSIZE;} 00103 bool sane() { 00104 return RecoveryGzipHeader::sane() && 00105 (m_SI1=='B' && m_SI2=='C' && m_SLEN==2 && m_BSIZE > sizeof(BGZFHeader)); 00106 } 00107 }; 00108 00109 #pragma pack(pop) 00110 00111 // 00112 // PeekaheadBuffer allows non-destructive peekahead and resyncing 00113 // after read errors when the underlying stream has signatures in the 00114 // data that allow it. 00115 // 00116 // In addition, it has a peek() capability to allow 00117 // the caller to look ahead in the stream to see 00118 // a certain number of bytes before actually consuming them. 00119 // 00120 // The intent is that this class behave as something of a poor 00121 // man's FIFO - with the cost of buffer movement when data is removed. 00122 // 00123 // This is far from ideal, but we basically are moving data around 00124 // when allowing arbitrary peekahead regardless. 00125 // 00126 // The basis for the design is the fact that most read calls to 00127 // various streams at best allow a single character to be peeked 00128 // at, and secondly, do not allow for recovery after an underfling 00129 // framing error occurs. 
//
// That is, getchar()/putchar/ungetchar() support a single byte
// peek.  This may be fine for simple parsing applications, but here
// we need to look at blocks up to 64K or more in size to search
// for signatures while re-synchronizing on the underlying stream.
//
class PeekaheadBuffer : public std::vector<uint8_t> {

protected:
    ssize_t m_startPosition;    // start of fresh data

public:
    enum ReturnCode {
        endOfFile = -1,
        reSync = 0,
        ok = 1
    };

    // Offset (from begin()) of the first byte not yet consumed by read().
    ssize_t startPosition() {return m_startPosition;}

    //
    // Discard all buffered data.
    //
    // This hides std::vector::clear() on purpose: emptying the vector
    // without also rewinding m_startPosition would leave the offset
    // pointing past the end, make dataRemaining() negative, and cause
    // the first m_startPosition bytes of the next data to be skipped.
    //
    void clear() {
        std::vector<uint8_t>::clear();
        m_startPosition = 0;
    }

private:
    //
    // when remaining data is 1/8 the size of the full
    // buffer, shift it back down to the start.
    //
    // for use by read(), which will consume data from the buffer.
    //
    void shiftData() {
        if(dataRemaining() < (ssize_t) (std::vector<uint8_t>::size() / 8) ) {
            erase(begin(), begin() + m_startPosition);
            m_startPosition = 0;
        }
    }
    // called when read reports an error for some
    // reason -
    virtual ReturnCode sync();
public:
    PeekaheadBuffer();
    virtual ~PeekaheadBuffer();

    // return the amount of unused data:
    ssize_t dataRemaining();

    //
    // overload size() to throw an exception - too confusing otherwise
//  size_t size() {abort();}

    //
    // just populate data in buffer from stream - not generic
    //
    // XXX note that it simply ensures that count bytes of data
    // are actually loaded into the buffer - if that amount of
    // data (or more) is present, this call is a NOP.
    //
    virtual ReturnCode readahead(ssize_t count) = 0;

    //
    // read is generic.
    // remove data from our buffer - call non-generic readahead to populate data.
    //
    // buffer - destination, must have room for count bytes
    // count  - number of bytes to consume; a count <= 0 copies nothing
    //
    // Returns ok when count bytes were copied and consumed, reSync when
    // readahead() reported reSync, and endOfFile for any other result.
    // Nothing is copied or consumed unless ok is returned.
    //
    ReturnCode read(uint8_t *buffer, ssize_t count) {
        const ReturnCode rc = readahead(count);

        if(rc == reSync) {
            // peek puked - CRC error, other errors, see if we can sync forwards
            return reSync;
        }
        if(rc != ok) {
            // failed to get needed data - premature EOF, I guess
            return endOfFile;
        }

        // Guard the copy: with nothing requested the vector may be
        // empty, and dereferencing begin() would then be undefined.
        if(count > 0) {
            const uint8_t *src = &(*begin()) + m_startPosition;

            memcpy(buffer, src, count);

            m_startPosition += count;   // consume data
        }

        // recover space if wasting too much:
        shiftData();

        return ok;
    }

};

PeekaheadBuffer::PeekaheadBuffer() : m_startPosition(0)
{
}

PeekaheadBuffer::~PeekaheadBuffer()
{
}

// Default resynchronisation: throw away everything buffered (which also
// rewinds the read offset) and report ok.
PeekaheadBuffer::ReturnCode PeekaheadBuffer::sync() {
    clear();
    return ok;
}

// Number of buffered bytes that read() has not yet consumed.
ssize_t PeekaheadBuffer::dataRemaining()
{
    return static_cast<ssize_t>(std::vector<uint8_t>::size()) - m_startPosition;
}


//
// peekahead buffered file reader class
//
// The reader closes the stream it was given when it is destroyed.
// NOTE(review): the class is copyable by default, and a copy would
// close the same FILE twice - do not copy instances.
//
class FileReader : public PeekaheadBuffer {
    FILE    *m_stream;
public:
    FileReader();
    ~FileReader();
    FileReader(FILE *stream);
    PeekaheadBuffer::ReturnCode readahead(ssize_t count);
    FILE    *stream() {return m_stream;}
    // false when there is no stream at all
    bool eof() {return m_stream != NULL && feof(m_stream) != 0;}
};

FileReader::FileReader()
{
    m_stream = NULL;
}

FileReader::FileReader(FILE *stream) : m_stream(stream)
{
}

FileReader::~FileReader()
{
    // fclose(NULL) is undefined, and the stream is NULL for a
    // default-constructed reader or when the caller's fopen() failed.
    if(m_stream != NULL) {
        fclose(m_stream);
    }
    m_stream = NULL;
}

//
// fill buffer until we have count bytes of valid
// data.
//
// need to detect error and eof and return appropriate values.
00267 // 00268 PeekaheadBuffer::ReturnCode FileReader::readahead(ssize_t count) 00269 { 00270 uint8_t buffer[4096]; 00271 while(dataRemaining() < count) { 00272 int bytesRead = fread(buffer, 1, sizeof(buffer), m_stream); 00273 if(bytesRead==0) { 00274 if(ferror(m_stream)) { 00275 return reSync; 00276 } 00277 // ain't getting no more data... 00278 return endOfFile; 00279 } 00280 #if 0 00281 fprintf(stderr, "\n\n"); 00282 int possible = -1; 00283 for(int i=0;i<bytesRead;i+=16) { 00284 fprintf(stderr,"%08x: ", i); 00285 for(int j=0;j<16;j++) { 00286 if(buffer[i+j]==31 && buffer[i+j+1]==139) { 00287 possible = i+j; 00288 } 00289 fprintf(stderr,"%02x ", buffer[i+j]); 00290 } 00291 fprintf(stderr, "\n"); 00292 } 00293 if(possible>0) { 00294 fprintf(stderr,"possible signature at %08x\n", possible); 00295 } 00296 #endif 00297 insert(end(), &buffer[0], &buffer[0] + bytesRead); 00298 } 00299 return ok; 00300 } 00301 00302 class BGZFReader : public PeekaheadBuffer { 00303 FileReader m_fileReader; 00304 00305 public: 00306 00307 BGZFReader(FILE *stream) : m_fileReader(stream) {;} 00308 00309 PeekaheadBuffer::ReturnCode readahead(ssize_t count); 00310 00311 // 00312 // This will be reading data, and needs to return EOF, etc 00313 // 00314 ReturnCode sync() { 00315 // my internal data is now bad, so we'll scan ahead seeing 00316 // if we can find a good header 00317 clear(); 00318 PeekaheadBuffer::ReturnCode rc; 00319 while((rc = m_fileReader.readahead(sizeof(BGZFHeader)))==ok ) { 00320 BGZFHeader *header; 00321 if(rc==endOfFile) return rc; 00322 // a rc==reSync is ok provided readahead still ensures that header is present 00323 void *src = &(*(m_fileReader.begin())) + m_fileReader.startPosition(); 00324 header = (BGZFHeader *) src; 00325 if(header->sane()) { 00326 if(debug) std::cerr << "BGZFReader::sync returning reSync\n"; 00327 return reSync; // tell caller they need to sync up 00328 } 00329 // consume a byte, then see if we're at a valid block header 00330 uint8_t 
throwAwayBuffer; 00331 rc = m_fileReader.read(&throwAwayBuffer, 1); 00332 } 00333 return rc; 00334 } 00335 FILE *stream() {return m_fileReader.stream();} 00336 00337 bool eof() {return dataRemaining()==0 && m_fileReader.eof();} 00338 00339 }; 00340 00341 PeekaheadBuffer::ReturnCode BGZFReader::readahead(ssize_t count) 00342 { 00343 BGZFHeader header; 00344 // size of inflateBuffer can be determined from ISIZE, I think 00345 uint8_t inflateBuffer[64*1024]; 00346 uint8_t gzipBuffer[64*1024+1]; 00347 00348 while(dataRemaining() < count) { 00349 static int loopCount = 0; 00350 00351 if(debug) std::cerr << "BGZFReader::readahead loopcount = " << loopCount++ << "\n"; 00352 00353 // here we actually read data: 00354 // read what should be the header 00355 // verify the header 00356 // read the remainder of the block 00357 // check the CRC validity or perhaps just call unzip 00358 // 00359 // XXX the sizeof(header) is wrong: 00360 PeekaheadBuffer::ReturnCode rc = m_fileReader.read((uint8_t *) (&header), sizeof(header)); 00361 00362 if(rc == endOfFile) { 00363 return endOfFile; 00364 } 00365 00366 // if we have a bad header, start looking forward for a good one, 00367 if(!header.sane()) { 00368 // sync does not consume the next good header, it simply syncs() 00369 // the data stream to the next believed good BGZF header: 00370 if(debug) std::cerr << "BGZFReader::readahead found corrupt BGZF header - now calling sync()\n"; 00371 rc = sync(); 00372 // 00373 // even though we can now decompress, we need to tell the caller 00374 // what is up before they call for more data (caller needs to 00375 // sync its own record stream): 00376 return rc; 00377 } 00378 00379 // Read the remainder of the block. 00380 // BSIZE is size of the entire block - 1, so compensate. 
00381 rc = m_fileReader.read((uint8_t *) &gzipBuffer, header.BSIZE() + 1 - sizeof(header)); 00382 00383 if(rc == reSync) { 00384 if(debug) std::cerr << "BGZFReader::readahead got incomplete BGZF read - now calling sync()\n"; 00385 sync(); 00386 return reSync; 00387 } 00388 00389 // 00390 // we read a header, but our attempt to read more data ended early, 00391 // so best to just return EOF 00392 // 00393 if(rc == endOfFile) { 00394 return rc; 00395 } 00396 00397 PeekaheadBuffer::ReturnCode bgzf_rc = ok; 00398 // zs.opaque is set when zalloc is NULL 00399 // 00400 // NB: zlib inflateInit2() has valgrind errors 00401 // in versions <1.2.4 - those can be ignored. 00402 // 00403 z_stream zs; 00404 zs.zalloc = NULL; 00405 zs.zfree = NULL; 00406 zs.next_in = gzipBuffer; 00407 zs.avail_in = header.BSIZE() - 16; // XXX need to check docs for inflate 00408 zs.next_out = inflateBuffer; 00409 zs.avail_out = sizeof(inflateBuffer); 00410 00411 // -15 --> raw inflate - don't look for gzip or zlib header 00412 // This can be optimized - inflateInit2 does a malloc of 00413 // approximately 10K (sizeof(inflate_state)) 00414 if(inflateInit2(&zs, -15) != Z_OK) { 00415 bgzf_rc = reSync; 00416 if(debug) std::cerr << "BGZFReader::readahead - inflateInit2 failed (out of memory?)\n"; 00417 // XXX fatal? 00418 } 00419 if(bgzf_rc==ok && inflate(&zs, Z_FINISH) != Z_STREAM_END) { 00420 bgzf_rc = reSync; 00421 if(debug) std::cerr << "BGZFReader::readahead - inflate failed (bad data), calling sync()\n"; 00422 } 00423 00424 if(bgzf_rc == ok) { 00425 if(inflateEnd(&zs) == Z_OK) { 00426 // do something with zs.total_out 00427 if(debug) std::cout << "hey, got data! 
zs.total_out == " << zs.total_out << "\n"; 00428 00429 // append the newly decompressed data 00430 insert(end(), &inflateBuffer[0], &inflateBuffer[0] + zs.total_out); 00431 } else { 00432 // seems exceptionall unlikely, but check this error case too 00433 bgzf_rc = reSync; 00434 if(debug) std::cerr << "BGZFReader::readahead - inflateInit2 failed (out of memory?)\n"; 00435 // XXX fatal? 00436 } 00437 } 00438 00439 if(bgzf_rc != ok) { 00440 inflateEnd(&zs); 00441 sync(); 00442 return bgzf_rc; 00443 } 00444 00445 // may need to get more data - loop back till all is complete 00446 } 00447 00448 return ok; 00449 00450 } 00451 00452 00453 #if 0 00454 void testBGZFBuffer() 00455 { 00456 BGZFReader b(stdin); 00457 std::vector<uint8_t>::iterator position; 00458 BGZFReader::ReturnCode rc; 00459 00460 std::cout << "size = " << b.dataRemaining() << "\n"; 00461 00462 // 00463 // this should: 00464 // decompress a BGZF block, populating the buffer with 00465 // unzipped data, possibly returning a BGZFBuffer::ReturnCode of 00466 // resync if it turns out the BGZF data was interrupted by bad 00467 // CRC checks. 00468 // 00469 rc = b.readahead(64); 00470 std::cout << "rc = " << rc << " - expect ok (1)\n"; 00471 std::cout << "size (expect 64) = " << b.size() << "\n"; 00472 } 00473 00474 00475 int main(int argc, const char **argv) 00476 { 00477 testBGZFBuffer(); 00478 } 00479 #endif 00480 00481 00482 00483 int BgzfFileTypeRecovery::close() 00484 { 00485 if(bgzfReader) delete bgzfReader; 00486 bgzfReader = NULL; 00487 return true; 00488 } 00489 00490 00491 BgzfFileTypeRecovery::BgzfFileTypeRecovery(const char * filename, const char * mode) 00492 { 00493 if(tolower(mode[0])=='r') { 00494 FILE *f = fopen(filename,"r"); 00495 bgzfReader = new BGZFReader(f); 00496 } else { 00497 // die for now 00498 if(debug) std::cerr << "Unable to open " << filename << " in mode " << mode << ".\n"; 00499 close(); 00500 } 00501 } 00502 00503 // 00504 // Why is this ever called? 
00505 // 00506 bool BgzfFileTypeRecovery::operator == (void * rhs) 00507 { 00508 throw std::logic_error("BgzfFileTypeRecovery::operator == is dangerous - do not use"); 00509 return false; 00510 } 00511 00512 bool BgzfFileTypeRecovery::operator != (void * rhs) 00513 { 00514 throw std::logic_error("BgzfFileTypeRecovery::operator != is dangerous - do not use"); 00515 return false; 00516 } 00517 00518 int BgzfFileTypeRecovery::eof() 00519 { 00520 return bgzfReader->eof(); 00521 } 00522 00523 unsigned int BgzfFileTypeRecovery::write(const void * buffer, unsigned int size) 00524 { 00525 // currently unsupported 00526 return 0; 00527 } 00528 00529 int BgzfFileTypeRecovery::read(void * buffer, unsigned int size) 00530 { 00531 00532 if(bgzfReader == NULL) { 00533 return 0; 00534 } 00535 00536 PeekaheadBuffer::ReturnCode rc = bgzfReader->read((uint8_t *) buffer, size); 00537 // endOfFile = -1, 00538 // reSync = 0, 00539 // ok = 1 00540 switch(rc) { 00541 case PeekaheadBuffer::endOfFile: 00542 // set a flag? 00543 return 0; 00544 case PeekaheadBuffer::reSync: 00545 // we could encode more info in the exception message here: 00546 if(debug) std::cerr << "throwing BGZF sync exception\n"; 00547 throw std::runtime_error("BGZF stream resync"); 00548 case PeekaheadBuffer::ok: 00549 // 00550 // in bgzfReader, we always are ensured we 00551 // get the full amount of the read, otherwise 00552 // an error is thrown. 00553 // 00554 return size; 00555 } 00556 // NOTREACHED 00557 return 0; 00558 } 00559 00560 int64_t BgzfFileTypeRecovery::tell() 00561 { 00562 // currently unsupported 00563 return 0; 00564 } 00565 00566 bool BgzfFileTypeRecovery::seek(int64_t offset, int origin) 00567 { 00568 // currently unsupported 00569 return 0; 00570 } 00571 00572 00573 bool BgzfFileTypeRecovery::attemptRecoverySync(bool (*checkSignature)(void *data) , int length) 00574 { 00575 // 00576 // creep along a byte at a time, checking for signature. 00577 // 00578 // possibly slow. 
should only need to scan ahead < 64K bytes 00579 // or so, however, so should recover in "reasonable" time. 00580 // 00581 while( bgzfReader->readahead(length) == PeekaheadBuffer::ok) { 00582 char ch; 00583 void *src = &(*(bgzfReader->begin())) + bgzfReader->startPosition(); 00584 00585 // 00586 // readahead ensures we have 'length' bytes of 00587 // data to check that is valid in the buffer. 00588 // 00589 if((*checkSignature)(src)) return true; 00590 PeekaheadBuffer::ReturnCode rc = bgzfReader->read((uint8_t *) &ch, 1); 00591 if(rc!=PeekaheadBuffer::ok) return false; 00592 // we consumed a byte, so go back to top of loop, 00593 // resume filling buffer (if need be) and re-check 00594 } 00595 00596 00597 return false; 00598 } 00599 00600 #endif