00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 /*! \file */ 00018 #ifndef __INPUTFILE_H__ 00019 #define __INPUTFILE_H__ 00020 00021 #ifdef __gnu_linux__ 00022 #ifndef __ZLIB_AVAILABLE__ 00023 #define __ZLIB_AVAILABLE__ 00024 #endif 00025 #endif 00026 00027 #include <stdio.h> 00028 #include <iostream> 00029 #include <cstring> 00030 #include <stdint.h> 00031 00032 #include "FileType.h" 00033 00034 /// Class for easily reading/writing files without having to worry about 00035 /// file type (uncompressed, gzip, bgzf) when reading. 00036 /// It hides the low level file operations/structure from the user, allowing 00037 /// them to generically open and operate on a file using the same 00038 /// interface without knowing the file format (standard uncompressed, 00039 /// gzip, or bgzf). For writing, the user must specify the file type. 00040 /// There is a typedef IFILE which is InputFile* and setup to mimic FILE 00041 /// including global methods that take IFILE as a parameter. 00042 class InputFile 00043 { 00044 bool myAttemptRecovery; // use recovery techniques if possible 00045 public: 00046 00047 /// Compression to use when writing a file & decompression used when 00048 /// reading a file from stdin. Any other read checks the file to determine 00049 /// how to uncompress it. 00050 enum ifileCompression { 00051 DEFAULT, ///< Check the extension, if it is ".gz", treat as gzip, otherwise treat it as UNCOMPRESSED. 00052 UNCOMPRESSED, ///< uncompressed file. 00053 GZIP, ///< gzip file. 00054 BGZF ///< bgzf file. 00055 }; 00056 00057 /// Default constructor 00058 InputFile() 00059 { 00060 myAttemptRecovery = false; 00061 myFileTypePtr = NULL; 00062 myBufferIndex = 0; 00063 myCurrentBufferSize = 0; 00064 // Default to buffer. 00065 myAllocatedBufferSize = DEFAULT_BUFFER_SIZE; 00066 myFileBuffer = new char[myAllocatedBufferSize]; 00067 myFileName.clear(); 00068 } 00069 00070 /// Destructor 00071 ~InputFile(); 00072 00073 /// Constructor for opening a file. 00074 /// \param filename file to open 00075 /// \param mode same format as fopen: "r" for read & "w" for write. 00076 /// \param compressionMode set the type of file to open for writing or 00077 /// for reading from stdin (when reading files, the compression type is 00078 /// determined by reading the file). 00079 InputFile(const char * filename, const char * mode, 00080 InputFile::ifileCompression compressionMode = InputFile::DEFAULT); 00081 00082 /// Set the buffer size for reading from files so that bufferSize bytes 00083 /// are read at a time and stored until accessed by another read call. 00084 /// This improves performance over reading the file small bits at a time. 00085 /// Buffering reads disables the tell call for bgzf files. 00086 /// Any previous values in the buffer will be deleted. 00087 /// \param bufferSize number of bytes to read/buffer at a time, 00088 /// default buffer size is 1048576, and turn off read buffering by setting 00089 /// bufferSize = 1; 00090 inline void bufferReads(unsigned int bufferSize = DEFAULT_BUFFER_SIZE) 00091 { 00092 // If the buffer size is the same, do nothing. 00093 if(bufferSize == myAllocatedBufferSize) 00094 { 00095 return; 00096 } 00097 // Delete the previous buffer. 00098 if(myFileBuffer != NULL) 00099 { 00100 delete[] myFileBuffer; 00101 } 00102 myBufferIndex = 0; 00103 myCurrentBufferSize = 0; 00104 // The buffer size must be at least 1 so one character can be 00105 // read and ifgetc can just assume reading into the buffer. 00106 if(bufferSize < 1) 00107 { 00108 bufferSize = 1; 00109 } 00110 myFileBuffer = new char[bufferSize]; 00111 myAllocatedBufferSize = bufferSize; 00112 00113 if(myFileTypePtr != NULL) 00114 { 00115 if(bufferSize == 1) 00116 { 00117 myFileTypePtr->setBuffered(false); 00118 } 00119 else 00120 { 00121 myFileTypePtr->setBuffered(true); 00122 } 00123 } 00124 } 00125 00126 00127 /// Disable read buffering. 00128 inline void disableBuffering() 00129 { 00130 bufferReads(1); 00131 if(myFileTypePtr != NULL) 00132 { 00133 myFileTypePtr->setBuffered(false); 00134 } 00135 } 00136 00137 00138 /// Close the file. 00139 /// \return status of the close (0 is success). 00140 inline int ifclose() 00141 { 00142 if (myFileTypePtr == NULL) 00143 { 00144 return EOF; 00145 } 00146 int result = myFileTypePtr->close(); 00147 delete myFileTypePtr; 00148 myFileTypePtr = NULL; 00149 myFileName.clear(); 00150 return result; 00151 } 00152 00153 /// Read size bytes from the file into the buffer. 00154 /// \param buffer pointer to memory at least size bytes big to write the 00155 /// data into. 00156 /// \param size number of bytes to be read 00157 /// \return number of bytes read, if it is not equal to size, 00158 /// there was either an error or the end of the file was reached, use 00159 /// ifeof to determine which case it was. 00160 inline int ifread(void * buffer, unsigned int size) 00161 { 00162 // There are 2 cases: 00163 // 1) There are already size available bytes in buffer. 00164 // 2) There are not size bytes in buffer. 00165 00166 // Determine the number of available bytes in the buffer. 00167 unsigned int availableBytes = myCurrentBufferSize - myBufferIndex; 00168 int returnSize = 0; 00169 00170 // Case 1: There are already size available bytes in buffer. 00171 if (size <= availableBytes) 00172 { 00173 // Just copy from the buffer, increment the index and return. 00174 memcpy(buffer, myFileBuffer+myBufferIndex, size); 00175 // Increment the buffer index. 00176 myBufferIndex += size; 00177 returnSize = size; 00178 } 00179 // Case 2: There are not size bytes in buffer. 00180 else 00181 { 00182 // Check to see if there are some bytes in the buffer. 00183 if (availableBytes > 0) 00184 { 00185 // Size > availableBytes > 0 00186 // Copy the available bytes into the buffer. 00187 memcpy(buffer, myFileBuffer+myBufferIndex, availableBytes); 00188 } 00189 // So far availableBytes have been copied into the read buffer. 00190 returnSize = availableBytes; 00191 // Increment myBufferIndex by what was read. 00192 myBufferIndex += availableBytes; 00193 00194 unsigned int remainingSize = size - availableBytes; 00195 00196 // Check if the remaining size is more or less than the 00197 // max buffer size. 00198 if(remainingSize < myAllocatedBufferSize) 00199 { 00200 // the remaining size is not the full buffer, but read 00201 // a full buffer worth of data anyway. 00202 myCurrentBufferSize = 00203 readFromFile(myFileBuffer, myAllocatedBufferSize); 00204 00205 // Check for an error. 00206 if(myCurrentBufferSize <= 0) 00207 { 00208 // No more data was successfully read, so check to see 00209 // if any data was copied to the return buffer at all. 00210 if( returnSize == 0) 00211 { 00212 // No data has been copied at all into the 00213 // return read buffer, so just return the value 00214 // returned from readFromFile. 00215 returnSize = myCurrentBufferSize; 00216 // Otherwise, returnSize is already set to the 00217 // available bytes that was already copied (so no 00218 // else statement is needed). 00219 } 00220 // Set myBufferIndex & myCurrentBufferSize to 0. 00221 myCurrentBufferSize = 0; 00222 myBufferIndex = 0; 00223 } 00224 else 00225 { 00226 // Successfully read more data. 00227 // Check to see how much was copied. 00228 int copySize = remainingSize; 00229 if(copySize > myCurrentBufferSize) 00230 { 00231 // Not the entire requested amount was read 00232 // (either from EOF or there was a partial read due to 00233 // an error), so set the copySize to what was read. 00234 copySize = myCurrentBufferSize; 00235 } 00236 00237 // Now copy the rest of the bytes into the buffer. 00238 memcpy((char*)buffer+availableBytes, 00239 myFileBuffer, copySize); 00240 00241 // set the buffer index to the location after what we are 00242 // returning as read. 00243 myBufferIndex = copySize; 00244 00245 returnSize += copySize; 00246 } 00247 } 00248 else 00249 { 00250 // More remaining to be read than the max buffer size, so just 00251 // read directly into the output buffer. 00252 int readSize = readFromFile((char*)buffer + availableBytes, 00253 remainingSize); 00254 00255 // Already used the buffer, so "clear" it. 00256 myCurrentBufferSize = 0; 00257 myBufferIndex = 0; 00258 if(readSize <= 0) 00259 { 00260 // No more data was successfully read, so check to see 00261 // if any data was copied to the return buffer at all. 00262 if(returnSize == 0) 00263 { 00264 // No data has been copied at all into the 00265 // return read buffer, so just return the value 00266 // returned from readFromFile. 00267 returnSize = readSize; 00268 // Otherwise, returnSize is already set to the 00269 // available bytes that was already copied (so no 00270 // else statement is needed). 00271 } 00272 } 00273 else 00274 { 00275 // More data was read, so increment the return count. 00276 returnSize += readSize; 00277 } 00278 } 00279 } 00280 return(returnSize); 00281 } 00282 00283 00284 /// Get a character from the file. Read a character from the internal 00285 /// buffer, or if the end of the buffer has been reached, read from the 00286 /// file into the buffer and return index 0. 00287 /// \return character that was read or EOF. 00288 inline int ifgetc() 00289 { 00290 if (myBufferIndex >= myCurrentBufferSize) 00291 { 00292 // at the last index, read a new buffer. 00293 myCurrentBufferSize = readFromFile(myFileBuffer, myAllocatedBufferSize); 00294 myBufferIndex = 0; 00295 } 00296 // If the buffer index is still greater than or equal to the 00297 // myCurrentBufferSize, then we failed to read the file - return EOF. 00298 if (myBufferIndex >= myCurrentBufferSize) 00299 { 00300 return(EOF); 00301 } 00302 return(myFileBuffer[myBufferIndex++]); 00303 } 00304 00305 /// Reset to the beginning of the file. 00306 inline void ifrewind() 00307 { 00308 // Just set the myBufferIndex and the myCurrentBufferSize to 0 to simulate 00309 // clearing the buffer and call rewind to move to the beginning of the 00310 // file. 00311 if (myFileTypePtr == NULL) 00312 { 00313 // No pointer, so nothing to rewind. 00314 return; 00315 } 00316 myCurrentBufferSize = 0; 00317 myBufferIndex = 0; 00318 myFileTypePtr->rewind(); 00319 } 00320 00321 00322 /// Check to see if we have reached the EOF. 00323 /// \return 0 if not EOF, any other value means EOF. 00324 inline int ifeof() 00325 { 00326 // Not EOF if we are not at the end of the buffer. 00327 if (myBufferIndex < myCurrentBufferSize) 00328 { 00329 // There are still available bytes in the buffer, so NOT EOF. 00330 return false; 00331 } 00332 else 00333 { 00334 if (myFileTypePtr == NULL) 00335 { 00336 // No myFileTypePtr, so not eof (return 0). 00337 return 0; 00338 } 00339 // exhausted our buffer, so check the file for eof. 00340 return myFileTypePtr->eof(); 00341 } 00342 } 00343 00344 /// Write the specified buffer into the file. 00345 /// \param buffer buffer containing size bytes to write to the file. 00346 /// \param size number of bytes to write 00347 /// \return number of bytes written 00348 /// We do not buffer the write call, so just leave this as normal. 00349 inline unsigned int ifwrite(const void * buffer, unsigned int size) 00350 { 00351 if (myFileTypePtr == NULL) 00352 { 00353 // No myFileTypePtr, so return 0 - nothing written. 00354 return 0; 00355 } 00356 return myFileTypePtr->write(buffer, size); 00357 } 00358 00359 /// Returns whether or not the file was successfully opened. 00360 /// \return true if the file is open, false if not. 00361 inline bool isOpen() 00362 { 00363 // It is open if the myFileTypePtr is set and says it is open. 00364 if ((myFileTypePtr != NULL) && myFileTypePtr->isOpen()) 00365 { 00366 return true; 00367 } 00368 // File was not successfully opened. 00369 return false; 00370 } 00371 00372 /// Get current position in the file. 00373 /// \return current position in the file, -1 indicates an error. 00374 inline int64_t iftell() 00375 { 00376 if (myFileTypePtr == NULL) 00377 { 00378 // No myFileTypePtr, so return false - could not seek. 00379 return -1; 00380 } 00381 int64_t pos = myFileTypePtr->tell(); 00382 pos -= (myCurrentBufferSize - myBufferIndex); 00383 return(pos); 00384 } 00385 00386 00387 /// Seek to the specified offset from the origin. 00388 /// \param offset offset into the file to move to (must be from a tell call) 00389 /// \param origin can be any of the following: 00390 /// Note: not all are valid for all filetypes. 00391 /// SEEK_SET - Beginning of file 00392 /// SEEK_CUR - Current position of the file pointer 00393 /// SEEK_END - End of file 00394 /// \return true on successful seek and false on a failed seek. 00395 inline bool ifseek(int64_t offset, int origin) 00396 { 00397 if (myFileTypePtr == NULL) 00398 { 00399 // No myFileTypePtr, so return false - could not seek. 00400 return false; 00401 } 00402 // TODO - may be able to seek within the buffer if applicable. 00403 // Reset buffering since a seek is being done. 00404 myBufferIndex = 0; 00405 myCurrentBufferSize = 0; 00406 return myFileTypePtr->seek(offset, origin); 00407 } 00408 00409 /// Get the filename that is currently opened. 00410 /// \return filename associated with this class 00411 const char* getFileName() const 00412 { 00413 return(myFileName.c_str()); 00414 } 00415 00416 /// Enable (default) or disable recovery. 00417 /// 00418 /// When true, we can attach a myFileTypePtr 00419 /// that implements a recovery capable decompressor. 00420 /// This requires that the caller be able to catch 00421 /// the exception XXX "blah blah blah". 00422 /// 00423 void setAttemptRecovery(bool flag = false) 00424 { 00425 myAttemptRecovery = flag; 00426 } 00427 00428 bool attemptRecoverySync(bool (*checkSignature)(void *data) , int length) 00429 { 00430 if(myFileTypePtr==NULL) return false; 00431 return myFileTypePtr->attemptRecoverySync(checkSignature, length); 00432 } 00433 00434 // Open a file. Called by the constructor. 00435 // Returns true if the file was successfully opened, false otherwise. 00436 bool openFile(const char * filename, const char * mode, 00437 InputFile::ifileCompression compressionMode); 00438 00439 protected: 00440 // Read into a buffer from the file. Since the buffer is passed in and 00441 // this would bypass the myFileBuffer used by this class, this method must 00442 // be protected. 00443 inline int readFromFile(void * buffer, unsigned int size) 00444 { 00445 // If no myFileTypePtr, return 0 - nothing read. 00446 if (myFileTypePtr == NULL) 00447 { 00448 return 0; 00449 } 00450 return myFileTypePtr->read(buffer, size); 00451 } 00452 00453 #ifdef __ZLIB_AVAILABLE__ 00454 // Only necessary with zlib to determine what file type on a new 00455 // file. Without zlib, there are only uncompressed files, so a special 00456 // method is not needed to determine the type of file to open. 00457 // Open a file. This method will open a file with the specified name and 00458 // mode with the fileTypePtr associated with the specified compressionMode. 00459 void openFileUsingMode(const char* filename, const char* mode, 00460 InputFile::ifileCompression compressionMode); 00461 #endif 00462 00463 // The size of the buffer used by this class. 00464 static const unsigned int DEFAULT_BUFFER_SIZE = 1048576; 00465 00466 // Pointer to a class that interfaces with different file types. 00467 FileType* myFileTypePtr; 00468 00469 unsigned int myAllocatedBufferSize; 00470 00471 // Buffer used to do large reads rather than 1 by 1 character reads 00472 // from the file. The class is then managed to iterate through the buffer. 00473 char* myFileBuffer; 00474 00475 // Current index into the buffer. Used to track where we are in reading the 00476 // file from the buffer. 00477 int myBufferIndex; 00478 00479 // Current number of entries in the buffer. Used to ensure that 00480 // if a read did not fill the buffer, we stop before hitting the 00481 // end of what was read. 00482 int myCurrentBufferSize; 00483 00484 std::string myFileName; 00485 }; 00486 00487 00488 /// Define IFILE as a pointer to an InputFile object. 00489 typedef InputFile* IFILE; 00490 00491 00492 /// Open a file with the specified name and mode, using a filename of "-" to 00493 /// indicate stdin/stdout. 00494 /// \param filename file to open ("-" meands stdin/stdout) 00495 /// \param mode same format as fopen: "r" for read & "w" for write. 00496 /// \param compressionMode set the type of file to open for writing or 00497 /// for reading from stdin (when reading files not from stdin, the compression 00498 /// type is determined by reading the file). 00499 /// \return IFILE - pointer to the InputFile object that has been opened. 00500 inline IFILE ifopen(const char * filename, const char * mode, 00501 InputFile::ifileCompression compressionMode = InputFile::DEFAULT) 00502 { 00503 IFILE file = new InputFile(filename, mode, compressionMode); 00504 if (!file->isOpen()) 00505 { 00506 00507 // Not open, so delete the file, and return null. 00508 delete file; 00509 file = NULL; 00510 } 00511 return file; 00512 } 00513 00514 00515 /// Close the file. 00516 /// \param file file to be closed - IFILE is a pointer to an InputFile object 00517 /// \return status of the close (0 is success or if NULL is passed in). 00518 inline int ifclose(IFILE file) 00519 { 00520 if(file == NULL) 00521 { 00522 // NULL Pointer passed in, so return 0, since no file is open, so 00523 // does not need to be closed. 00524 return(0); 00525 } 00526 int result = file->ifclose(); 00527 delete file; 00528 file = NULL; 00529 return(result); 00530 } 00531 00532 /// Read up to size bytes from the file into the buffer. 00533 /// \param file file to be read - IFILE is a pointer to an InputFile object 00534 /// \param buffer pointer to memory at least size bytes big to write the 00535 /// data into. 00536 /// \param size number of bytes to be read 00537 /// \return number of bytes read 00538 inline unsigned int ifread(IFILE file, void * buffer, unsigned int size) 00539 { 00540 if(file == NULL) 00541 { 00542 // No file was passed in, so 0 bytes were read. 00543 return(0); 00544 } 00545 return(file->ifread(buffer, size)); 00546 } 00547 00548 /// Get a character from the file. Read a character from the internal 00549 /// buffer, or if the end of the buffer has been reached, read from the 00550 /// file into the buffer and return index 0. 00551 /// \param file file to be read - IFILE is a pointer to an InputFile object 00552 /// \return character that was read or EOF. 00553 inline int ifgetc(IFILE file) 00554 { 00555 if(file == NULL) 00556 { 00557 // return eof since there is no file. 00558 return(EOF); 00559 } 00560 return(file->ifgetc()); 00561 } 00562 00563 /// Reset to the beginning of the file (cannot be done for stdin/stdout). 00564 /// \param file file to be rewound - IFILE is a pointer to an InputFile object 00565 inline void ifrewind(IFILE file) 00566 { 00567 if(file == NULL) 00568 { 00569 return; 00570 } 00571 file->ifrewind(); 00572 } 00573 00574 /// Check to see if we have reached the EOF (returns 0 if not EOF). 00575 /// \param file file to be checked - IFILE is a pointer to an InputFile object 00576 /// \return 0 if not EOF, any other value means EOF. 00577 inline int ifeof(IFILE file) 00578 { 00579 if(file == NULL) 00580 { 00581 // No file, so that is considered to be EOF, so return 1. 00582 return(1); 00583 } 00584 return(file->ifeof()); 00585 } 00586 00587 /// Write the specified number of bytes from the specified buffer into the file. 00588 /// \param file file to write to - IFILE is a pointer to an InputFile object 00589 /// \param buffer buffer containing size bytes to write to the file. 00590 /// \param size number of bytes to write 00591 /// \return number of bytes written 00592 inline unsigned int ifwrite(IFILE file, const void * buffer, unsigned int size) 00593 { 00594 if(file == NULL) 00595 { 00596 // No file specified, so retun 0 bytes written. 00597 return(0); 00598 } 00599 return(file->ifwrite(buffer, size)); 00600 } 00601 00602 /// Get current position in the file. Can be fed back into ifseek. 00603 /// \param file file to perform tell on - IFILE is a pointer to an InputFile object 00604 /// \return current position in the file, -1 indicates an error. 00605 inline int64_t iftell(IFILE file) 00606 { 00607 if(file == NULL) 00608 { 00609 return(-1); 00610 } 00611 return (file->iftell()); 00612 } 00613 00614 /// Seek to the specified position (result from an iftell), but cannot 00615 /// be done for stdin/stdout. 00616 /// \param file file to perform seek on - IFILE is a pointer to an InputFile object 00617 /// \param offset offset into the file to move to (must be from a tell call) 00618 /// \param origin can be any of the following: 00619 /// Note: not all are valid for all filetypes. 00620 /// SEEK_SET - Beginning of file 00621 /// SEEK_CUR - Current position of the file pointer 00622 /// SEEK_END - End of file 00623 /// \return true on successful seek and false on a failed seek. 00624 inline bool ifseek(IFILE file, int64_t offset, int origin) 00625 { 00626 if(file == NULL) 00627 { 00628 // Could not see since no file was specified. 00629 return(false); 00630 } 00631 return (file->ifseek(offset, origin)); 00632 } 00633 00634 /// Write to a file using fprintf format. 00635 /// \param file file to write to - IFILE is a pointer to an InputFile object 00636 /// \param format printf format for writing, followed by parameters. 00637 /// \return number of bytes written 00638 int ifprintf(IFILE output, const char * format, ...); 00639 00640 /// Read a line from a file using streaming. 00641 /// \param stream file to read from - IFILE is a pointer to an InputFile object 00642 /// \param str output string containing the line read from the file. 00643 inline IFILE operator >> (IFILE stream, std::string &str) 00644 { 00645 str.clear(); 00646 int ch; 00647 // not safe... newline handling? 00648 while ((ch = stream->ifgetc())!=EOF && (ch != '\n')) str.push_back(ch); 00649 return stream; 00650 } 00651 00652 #endif 00653