00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 /*! \file */ 00018 #ifndef __INPUTFILE_H__ 00019 #define __INPUTFILE_H__ 00020 00021 #ifdef __gnu_linux__ 00022 #ifndef __ZLIB_AVAILABLE__ 00023 #define __ZLIB_AVAILABLE__ 00024 #endif 00025 #endif 00026 00027 #include <stdio.h> 00028 #include <iostream> 00029 #include <cstring> 00030 #include <stdint.h> 00031 00032 #include "FileType.h" 00033 00034 /// Class for easily reading/writing files without having to worry about 00035 /// file type (uncompressed, gzip, bgzf) when reading. 00036 class InputFile 00037 { 00038 bool myAttemptRecovery; // use recovery techniques if possible 00039 public: 00040 00041 /// Compression to use when writing a file & decompression used when 00042 /// reading a file from stdin. Any other read checks the file to determine 00043 /// how to uncompress it. 00044 enum ifileCompression { 00045 DEFAULT, ///< Check the extension, if it is ".gz", treat as gzip, otherwise treat it as UNCOMPRESSED. 00046 UNCOMPRESSED, ///< uncompressed file. 00047 GZIP, ///< gzip file. 00048 BGZF ///< bgzf file. 00049 }; 00050 00051 /// Default constructor 00052 InputFile() 00053 { 00054 myAttemptRecovery = false; 00055 myFileTypePtr = NULL; 00056 myBufferIndex = 0; 00057 myCurrentBufferSize = 0; 00058 // Default to buffer. 00059 myAllocatedBufferSize = DEFAULT_BUFFER_SIZE; 00060 myFileBuffer = new char[myAllocatedBufferSize]; 00061 myFileName.clear(); 00062 } 00063 00064 /// Destructor 00065 ~InputFile(); 00066 00067 /// Constructor for opening a file. 00068 /// \param filename file to open 00069 /// \param mode same format as fopen: "r" for read & "w" for write. 00070 /// \param compressionMode set the type of file to open for writing or 00071 /// for reading from stdin (when reading files, the compression type is 00072 /// determined by reading the file). 00073 InputFile(const char * filename, const char * mode, 00074 InputFile::ifileCompression compressionMode = InputFile::DEFAULT); 00075 00076 /// Set the buffer size for reading from files so that bufferSize bytes 00077 /// are read at a time and stored until accessed by another read call. 00078 /// This improves performance over reading the file small bits at a time. 00079 /// Buffering reads disables the tell call for bgzf files. 00080 /// Any previous values in the buffer will be deleted. 00081 /// \param bufferSize number of bytes to read/buffer at a time, 00082 /// default buffer size is 1048576, and turn off read buffering by setting 00083 /// bufferSize = 1; 00084 inline void bufferReads(unsigned int bufferSize = DEFAULT_BUFFER_SIZE) 00085 { 00086 // If the buffer size is the same, do nothing. 00087 if(bufferSize == myAllocatedBufferSize) 00088 { 00089 return; 00090 } 00091 // Delete the previous buffer. 00092 if(myFileBuffer != NULL) 00093 { 00094 delete[] myFileBuffer; 00095 } 00096 myBufferIndex = 0; 00097 myCurrentBufferSize = 0; 00098 // The buffer size must be at least 1 so one character can be 00099 // read and ifgetc can just assume reading into the buffer. 00100 if(bufferSize < 1) 00101 { 00102 bufferSize = 1; 00103 } 00104 myFileBuffer = new char[bufferSize]; 00105 myAllocatedBufferSize = bufferSize; 00106 00107 if(myFileTypePtr != NULL) 00108 { 00109 if(bufferSize == 1) 00110 { 00111 myFileTypePtr->setBuffered(false); 00112 } 00113 else 00114 { 00115 myFileTypePtr->setBuffered(true); 00116 } 00117 } 00118 } 00119 00120 00121 /// Disable read buffering. 00122 inline void disableBuffering() 00123 { 00124 bufferReads(1); 00125 if(myFileTypePtr != NULL) 00126 { 00127 myFileTypePtr->setBuffered(false); 00128 } 00129 } 00130 00131 00132 /// Close the file. 00133 /// \return status of the close (0 is success). 00134 inline int ifclose() 00135 { 00136 if (myFileTypePtr == NULL) 00137 { 00138 return EOF; 00139 } 00140 int result = myFileTypePtr->close(); 00141 delete myFileTypePtr; 00142 myFileTypePtr = NULL; 00143 myFileName.clear(); 00144 return result; 00145 } 00146 00147 /// Read size bytes from the file into the buffer. 00148 /// \param buffer pointer to memory at least size bytes big to write the 00149 /// data into. 00150 /// \param size number of bytes to be read 00151 /// \return number of bytes read, if it is not equal to size, 00152 /// there was either an error or the end of the file was reached, use 00153 /// ifeof to determine which case it was. 00154 inline int ifread(void * buffer, unsigned int size) 00155 { 00156 // There are 2 cases: 00157 // 1) There are already size available bytes in buffer. 00158 // 2) There are not size bytes in buffer. 00159 00160 // Determine the number of available bytes in the buffer. 00161 unsigned int availableBytes = myCurrentBufferSize - myBufferIndex; 00162 int returnSize = 0; 00163 00164 // Case 1: There are already size available bytes in buffer. 00165 if (size <= availableBytes) 00166 { 00167 // Just copy from the buffer, increment the index and return. 00168 memcpy(buffer, myFileBuffer+myBufferIndex, size); 00169 // Increment the buffer index. 00170 myBufferIndex += size; 00171 returnSize = size; 00172 } 00173 // Case 2: There are not size bytes in buffer. 00174 else 00175 { 00176 // Check to see if there are some bytes in the buffer. 00177 if (availableBytes > 0) 00178 { 00179 // Size > availableBytes > 0 00180 // Copy the available bytes into the buffer. 00181 memcpy(buffer, myFileBuffer+myBufferIndex, availableBytes); 00182 } 00183 // So far availableBytes have been copied into the read buffer. 00184 returnSize = availableBytes; 00185 // Increment myBufferIndex by what was read. 00186 myBufferIndex += availableBytes; 00187 00188 unsigned int remainingSize = size - availableBytes; 00189 00190 // Check if the remaining size is more or less than the 00191 // max buffer size. 00192 if(remainingSize < myAllocatedBufferSize) 00193 { 00194 // the remaining size is not the full buffer, but read 00195 // a full buffer worth of data anyway. 00196 myCurrentBufferSize = 00197 readFromFile(myFileBuffer, myAllocatedBufferSize); 00198 00199 // Check for an error. 00200 if(myCurrentBufferSize <= 0) 00201 { 00202 // No more data was successfully read, so check to see 00203 // if any data was copied to the return buffer at all. 00204 if( returnSize == 0) 00205 { 00206 // No data has been copied at all into the 00207 // return read buffer, so just return the value 00208 // returned from readFromFile. 00209 returnSize = myCurrentBufferSize; 00210 // Otherwise, returnSize is already set to the 00211 // available bytes that was already copied (so no 00212 // else statement is needed). 00213 } 00214 // Set myBufferIndex & myCurrentBufferSize to 0. 00215 myCurrentBufferSize = 0; 00216 myBufferIndex = 0; 00217 } 00218 else 00219 { 00220 // Successfully read more data. 00221 // Check to see how much was copied. 00222 int copySize = remainingSize; 00223 if(copySize > myCurrentBufferSize) 00224 { 00225 // Not the entire requested amount was read 00226 // (either from EOF or there was a partial read due to 00227 // an error), so set the copySize to what was read. 00228 copySize = myCurrentBufferSize; 00229 } 00230 00231 // Now copy the rest of the bytes into the buffer. 00232 memcpy((char*)buffer+availableBytes, 00233 myFileBuffer, copySize); 00234 00235 // set the buffer index to the location after what we are 00236 // returning as read. 00237 myBufferIndex = copySize; 00238 00239 returnSize += copySize; 00240 } 00241 } 00242 else 00243 { 00244 // More remaining to be read than the max buffer size, so just 00245 // read directly into the output buffer. 00246 int readSize = readFromFile((char*)buffer + availableBytes, 00247 remainingSize); 00248 00249 // Already used the buffer, so "clear" it. 00250 myCurrentBufferSize = 0; 00251 myBufferIndex = 0; 00252 if(readSize <= 0) 00253 { 00254 // No more data was successfully read, so check to see 00255 // if any data was copied to the return buffer at all. 00256 if(returnSize == 0) 00257 { 00258 // No data has been copied at all into the 00259 // return read buffer, so just return the value 00260 // returned from readFromFile. 00261 returnSize = readSize; 00262 // Otherwise, returnSize is already set to the 00263 // available bytes that was already copied (so no 00264 // else statement is needed). 00265 } 00266 } 00267 else 00268 { 00269 // More data was read, so increment the return count. 00270 returnSize += readSize; 00271 } 00272 } 00273 } 00274 return(returnSize); 00275 } 00276 00277 00278 /// Get a character from the file. Read a character from the internal 00279 /// buffer, or if the end of the buffer has been reached, read from the 00280 /// file into the buffer and return index 0. 00281 /// \return character that was read or EOF. 00282 inline int ifgetc() 00283 { 00284 if (myBufferIndex >= myCurrentBufferSize) 00285 { 00286 // at the last index, read a new buffer. 00287 myCurrentBufferSize = readFromFile(myFileBuffer, myAllocatedBufferSize); 00288 myBufferIndex = 0; 00289 } 00290 // If the buffer index is still greater than or equal to the 00291 // myCurrentBufferSize, then we failed to read the file - return EOF. 00292 if (myBufferIndex >= myCurrentBufferSize) 00293 { 00294 return(EOF); 00295 } 00296 return(myFileBuffer[myBufferIndex++]); 00297 } 00298 00299 /// Reset to the beginning of the file. 00300 inline void ifrewind() 00301 { 00302 // Just set the myBufferIndex and the myCurrentBufferSize to 0 to simulate 00303 // clearing the buffer and call rewind to move to the beginning of the 00304 // file. 00305 if (myFileTypePtr == NULL) 00306 { 00307 // No pointer, so nothing to rewind. 00308 return; 00309 } 00310 myCurrentBufferSize = 0; 00311 myBufferIndex = 0; 00312 myFileTypePtr->rewind(); 00313 } 00314 00315 00316 /// Check to see if we have reached the EOF. 00317 /// \return 0 if not EOF, any other value means EOF. 00318 inline int ifeof() 00319 { 00320 // Not EOF if we are not at the end of the buffer. 00321 if (myBufferIndex < myCurrentBufferSize) 00322 { 00323 // There are still available bytes in the buffer, so NOT EOF. 00324 return false; 00325 } 00326 else 00327 { 00328 if (myFileTypePtr == NULL) 00329 { 00330 // No myFileTypePtr, so not eof (return 0). 00331 return 0; 00332 } 00333 // exhausted our buffer, so check the file for eof. 00334 return myFileTypePtr->eof(); 00335 } 00336 } 00337 00338 /// Write the specified buffer into the file. 00339 /// \param buffer buffer containing size bytes to write to the file. 00340 /// \param size number of bytes to write 00341 /// \return number of bytes written 00342 /// We do not buffer the write call, so just leave this as normal. 00343 inline unsigned int ifwrite(const void * buffer, unsigned int size) 00344 { 00345 if (myFileTypePtr == NULL) 00346 { 00347 // No myFileTypePtr, so return 0 - nothing written. 00348 return 0; 00349 } 00350 return myFileTypePtr->write(buffer, size); 00351 } 00352 00353 /// Returns whether or not the file was successfully opened. 00354 /// \return true if the file is open, false if not. 00355 inline bool isOpen() 00356 { 00357 // It is open if the myFileTypePtr is set and says it is open. 00358 if ((myFileTypePtr != NULL) && myFileTypePtr->isOpen()) 00359 { 00360 return true; 00361 } 00362 // File was not successfully opened. 00363 return false; 00364 } 00365 00366 /// Get current position in the file. 00367 /// \return current position in the file, -1 indicates an error. 00368 inline int64_t iftell() 00369 { 00370 if (myFileTypePtr == NULL) 00371 { 00372 // No myFileTypePtr, so return false - could not seek. 00373 return -1; 00374 } 00375 int64_t pos = myFileTypePtr->tell(); 00376 pos -= (myCurrentBufferSize - myBufferIndex); 00377 return(pos); 00378 } 00379 00380 00381 /// Seek to the specified offset from the origin. 00382 /// \param offset offset into the file to move to (must be from a tell call) 00383 /// \param origin can be any of the following: 00384 /// Note: not all are valid for all filetypes. 00385 /// SEEK_SET - Beginning of file 00386 /// SEEK_CUR - Current position of the file pointer 00387 /// SEEK_END - End of file 00388 /// \return true on successful seek and false on a failed seek. 00389 inline bool ifseek(int64_t offset, int origin) 00390 { 00391 if (myFileTypePtr == NULL) 00392 { 00393 // No myFileTypePtr, so return false - could not seek. 00394 return false; 00395 } 00396 // TODO - may be able to seek within the buffer if applicable. 00397 // Reset buffering since a seek is being done. 00398 myBufferIndex = 0; 00399 myCurrentBufferSize = 0; 00400 return myFileTypePtr->seek(offset, origin); 00401 } 00402 00403 /// Get the filename that is currently opened. 00404 /// \return filename associated with this class 00405 const char* getFileName() const 00406 { 00407 return(myFileName.c_str()); 00408 } 00409 00410 /// Enable (default) or disable recovery. 00411 /// 00412 /// When true, we can attach a myFileTypePtr 00413 /// that implements a recovery capable decompressor. 00414 /// This requires that the caller be able to catch 00415 /// the exception XXX "blah blah blah". 00416 /// 00417 void setAttemptRecovery(bool flag = false) 00418 { 00419 myAttemptRecovery = flag; 00420 } 00421 00422 bool attemptRecoverySync(bool (*checkSignature)(void *data) , int length) 00423 { 00424 if(myFileTypePtr==NULL) return false; 00425 return myFileTypePtr->attemptRecoverySync(checkSignature, length); 00426 } 00427 00428 // Open a file. Called by the constructor. 00429 // Returns true if the file was successfully opened, false otherwise. 00430 bool openFile(const char * filename, const char * mode, 00431 InputFile::ifileCompression compressionMode); 00432 00433 protected: 00434 // Read into a buffer from the file. Since the buffer is passed in and 00435 // this would bypass the myFileBuffer used by this class, this method must 00436 // be protected. 00437 inline int readFromFile(void * buffer, unsigned int size) 00438 { 00439 // If no myFileTypePtr, return 0 - nothing read. 00440 if (myFileTypePtr == NULL) 00441 { 00442 return 0; 00443 } 00444 return myFileTypePtr->read(buffer, size); 00445 } 00446 00447 #ifdef __ZLIB_AVAILABLE__ 00448 // Only necessary with zlib to determine what file type on a new 00449 // file. Without zlib, there are only uncompressed files, so a special 00450 // method is not needed to determine the type of file to open. 00451 // Open a file. This method will open a file with the specified name and 00452 // mode with the fileTypePtr associated with the specified compressionMode. 00453 void openFileUsingMode(const char* filename, const char* mode, 00454 InputFile::ifileCompression compressionMode); 00455 #endif 00456 00457 // The size of the buffer used by this class. 00458 static const unsigned int DEFAULT_BUFFER_SIZE = 1048576; 00459 00460 // Pointer to a class that interfaces with different file types. 00461 FileType* myFileTypePtr; 00462 00463 unsigned int myAllocatedBufferSize; 00464 00465 // Buffer used to do large reads rather than 1 by 1 character reads 00466 // from the file. The class is then managed to iterate through the buffer. 00467 char* myFileBuffer; 00468 00469 // Current index into the buffer. Used to track where we are in reading the 00470 // file from the buffer. 00471 int myBufferIndex; 00472 00473 // Current number of entries in the buffer. Used to ensure that 00474 // if a read did not fill the buffer, we stop before hitting the 00475 // end of what was read. 00476 int myCurrentBufferSize; 00477 00478 std::string myFileName; 00479 }; 00480 00481 00482 /// Define IFILE as a pointer to an InputFile object. 00483 typedef InputFile* IFILE; 00484 00485 00486 /// Open a file. 00487 /// \param filename file to open 00488 /// \param mode same format as fopen: "r" for read & "w" for write. 00489 /// \param compressionMode set the type of file to open for writing or 00490 /// for reading from stdin (when reading files, the compression type is 00491 /// determined by reading the file). 00492 /// \return IFILE - pointer to the InputFile object that has been opened. 00493 inline IFILE ifopen(const char * filename, const char * mode, 00494 InputFile::ifileCompression compressionMode = InputFile::DEFAULT) 00495 { 00496 IFILE file = new InputFile(filename, mode, compressionMode); 00497 if (!file->isOpen()) 00498 { 00499 00500 // Not open, so delete the file, and return null. 00501 delete file; 00502 file = NULL; 00503 } 00504 return file; 00505 } 00506 00507 00508 /// Close the file. 00509 /// \param file file to be closed - IFILE is a pointer to an InputFile object 00510 /// \return status of the close (0 is success or if NULL is passed in). 00511 inline int ifclose(IFILE file) 00512 { 00513 if(file == NULL) 00514 { 00515 // NULL Pointer passed in, so return 0, since no file is open, so 00516 // does not need to be closed. 00517 return(0); 00518 } 00519 int result = file->ifclose(); 00520 delete file; 00521 file = NULL; 00522 return(result); 00523 } 00524 00525 /// Read size bytes from the file into the buffer. 00526 /// \param file file to be read - IFILE is a pointer to an InputFile object 00527 /// \param buffer pointer to memory at least size bytes big to write the 00528 /// data into. 00529 /// \param size number of bytes to be read 00530 /// \return number of bytes read 00531 inline unsigned int ifread(IFILE file, void * buffer, unsigned int size) 00532 { 00533 if(file == NULL) 00534 { 00535 // No file was passed in, so 0 bytes were read. 00536 return(0); 00537 } 00538 return(file->ifread(buffer, size)); 00539 } 00540 00541 /// Get a character from the file. Read a character from the internal 00542 /// buffer, or if the end of the buffer has been reached, read from the 00543 /// file into the buffer and return index 0. 00544 /// \param file file to be read - IFILE is a pointer to an InputFile object 00545 /// \return character that was read or EOF. 00546 inline int ifgetc(IFILE file) 00547 { 00548 if(file == NULL) 00549 { 00550 // return eof since there is no file. 00551 return(EOF); 00552 } 00553 return(file->ifgetc()); 00554 } 00555 00556 /// Reset to the beginning of the file. 00557 /// \param file file to be rewound - IFILE is a pointer to an InputFile object 00558 inline void ifrewind(IFILE file) 00559 { 00560 if(file == NULL) 00561 { 00562 return; 00563 } 00564 file->ifrewind(); 00565 } 00566 00567 /// Check to see if we have reached the EOF. 00568 /// \param file file to be checked - IFILE is a pointer to an InputFile object 00569 /// \return 0 if not EOF, any other value means EOF. 00570 inline int ifeof(IFILE file) 00571 { 00572 if(file == NULL) 00573 { 00574 // No file, so that is considered to be EOF, so return 1. 00575 return(1); 00576 } 00577 return(file->ifeof()); 00578 } 00579 00580 /// Write the specified buffer into the file. 00581 /// \param file file to write to - IFILE is a pointer to an InputFile object 00582 /// \param buffer buffer containing size bytes to write to the file. 00583 /// \param size number of bytes to write 00584 /// \return number of bytes written 00585 inline unsigned int ifwrite(IFILE file, const void * buffer, unsigned int size) 00586 { 00587 if(file == NULL) 00588 { 00589 // No file specified, so retun 0 bytes written. 00590 return(0); 00591 } 00592 return(file->ifwrite(buffer, size)); 00593 } 00594 00595 /// Get current position in the file. 00596 /// \param file file to perform tell on - IFILE is a pointer to an InputFile object 00597 /// \return current position in the file, -1 indicates an error. 00598 inline int64_t iftell(IFILE file) 00599 { 00600 if(file == NULL) 00601 { 00602 return(-1); 00603 } 00604 return (file->iftell()); 00605 } 00606 00607 /// Seek to the specified offset from the origin. 00608 /// \param file file to perform seek on - IFILE is a pointer to an InputFile object 00609 /// \param offset offset into the file to move to (must be from a tell call) 00610 /// \param origin can be any of the following: 00611 /// Note: not all are valid for all filetypes. 00612 /// SEEK_SET - Beginning of file 00613 /// SEEK_CUR - Current position of the file pointer 00614 /// SEEK_END - End of file 00615 /// \return true on successful seek and false on a failed seek. 00616 inline bool ifseek(IFILE file, int64_t offset, int origin) 00617 { 00618 if(file == NULL) 00619 { 00620 // Could not see since no file was specified. 00621 return(false); 00622 } 00623 return (file->ifseek(offset, origin)); 00624 } 00625 00626 /// Write to a file using fprintf format. 00627 /// \param file file to write to - IFILE is a pointer to an InputFile object 00628 /// \param format printf format for writing, followed by parameters. 00629 /// \return number of bytes written 00630 int ifprintf(IFILE output, const char * format, ...); 00631 00632 /// Read a line from a file using streaming. 00633 /// \param stream file to read from - IFILE is a pointer to an InputFile object 00634 /// \param str output string containing the line read from the file. 00635 inline IFILE operator >> (IFILE stream, std::string &str) 00636 { 00637 str.clear(); 00638 int ch; 00639 // not safe... newline handling? 00640 while ((ch = stream->ifgetc())!=EOF && (ch != '\n')) str.push_back(ch); 00641 return stream; 00642 } 00643 00644 #endif 00645