00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 /*! \file */ 00018 #ifndef __INPUTFILE_H__ 00019 #define __INPUTFILE_H__ 00020 00021 #ifdef __gnu_linux__ 00022 #ifndef __ZLIB_AVAILABLE__ 00023 #define __ZLIB_AVAILABLE__ 00024 #endif 00025 #endif 00026 00027 #include <stdio.h> 00028 #include <iostream> 00029 #include <cstring> 00030 00031 #include "FileType.h" 00032 00033 /// Class for easily reading/writing files without having to worry about 00034 /// file type (uncompressed, gzip, bgzf) when reading. 00035 class InputFile 00036 { 00037 public: 00038 00039 /// Compression to use when writing a file & decompression used when 00040 /// reading a file from stdin. Any other read checks the file to determine 00041 /// how to uncompress it. 00042 enum ifileCompression { 00043 DEFAULT, ///< Check the extension, if it is ".gz", treat as gzip, otherwise treat it as UNCOMPRESSED. 00044 UNCOMPRESSED, ///< uncompressed file. 00045 GZIP, ///< gzip file. 00046 BGZF ///< bgzf file. 00047 }; 00048 00049 /// Default constructor 00050 InputFile() 00051 { 00052 myFileTypePtr = NULL; 00053 myBufferIndex = 0; 00054 myCurrentBufferSize = 0; 00055 // Default to buffer. 
00056 myAllocatedBufferSize = DEFAULT_BUFFER_SIZE; 00057 myFileBuffer = new char[myAllocatedBufferSize]; 00058 myFileName.clear(); 00059 } 00060 00061 /// Destructor 00062 ~InputFile(); 00063 00064 /// Constructor for opening a file. 00065 /// \param filename file to open 00066 /// \param mode same format as fopen: "r" for read & "w" for write. 00067 /// \param compressionMode set the type of file to open for writing or 00068 /// for reading from stdin (when reading files, the compression type is 00069 /// determined by reading the file). 00070 InputFile(const char * filename, const char * mode, 00071 InputFile::ifileCompression compressionMode = InputFile::DEFAULT); 00072 00073 /// Set the buffer size for reading from files so that bufferSize bytes 00074 /// are read at a time and stored until accessed by another read call. 00075 /// This improves performance over reading the file small bits at a time. 00076 /// Buffering reads disables the tell call for bgzf files. 00077 /// Any previous values in the buffer will be deleted. 00078 /// \param bufferSize number of bytes to read/buffer at a time, 00079 /// default buffer size is 1048576, and turn off read buffering by setting 00080 /// bufferSize = 1; 00081 inline void bufferReads(unsigned int bufferSize = DEFAULT_BUFFER_SIZE) 00082 { 00083 // If the buffer size is the same, do nothing. 00084 if(bufferSize == myAllocatedBufferSize) 00085 { 00086 return; 00087 } 00088 // Delete the previous buffer. 00089 if(myFileBuffer != NULL) 00090 { 00091 delete[] myFileBuffer; 00092 } 00093 myBufferIndex = 0; 00094 myCurrentBufferSize = 0; 00095 // The buffer size must be at least 1 so one character can be 00096 // read and ifgetc can just assume reading into the buffer. 
00097 if(bufferSize < 1) 00098 { 00099 bufferSize = 1; 00100 } 00101 myFileBuffer = new char[bufferSize]; 00102 myAllocatedBufferSize = bufferSize; 00103 00104 if(myFileTypePtr != NULL) 00105 { 00106 if(bufferSize == 1) 00107 { 00108 myFileTypePtr->setBuffered(false); 00109 } 00110 else 00111 { 00112 myFileTypePtr->setBuffered(true); 00113 } 00114 } 00115 } 00116 00117 00118 /// Disable read buffering. 00119 inline void disableBuffering() 00120 { 00121 bufferReads(1); 00122 if(myFileTypePtr != NULL) 00123 { 00124 myFileTypePtr->setBuffered(false); 00125 } 00126 } 00127 00128 00129 /// Close the file. 00130 /// \return status of the close (0 is success). 00131 inline int ifclose() 00132 { 00133 if (myFileTypePtr == NULL) 00134 { 00135 return EOF; 00136 } 00137 int result = myFileTypePtr->close(); 00138 delete myFileTypePtr; 00139 myFileTypePtr = NULL; 00140 myFileName.clear(); 00141 return result; 00142 } 00143 00144 /// Read size bytes from the file into the buffer. 00145 /// \param buffer pointer to memory at least size bytes big to write the 00146 /// data into. 00147 /// \param size number of bytes to be read 00148 /// \return number of bytes read 00149 inline int ifread(void * buffer, unsigned int size) 00150 { 00151 // There are 2 cases: 00152 // 1) There are already size available bytes in buffer. 00153 // 2) There are not size bytes in buffer. 00154 00155 // Determine the number of available bytes in the buffer. 00156 unsigned int availableBytes = myCurrentBufferSize - myBufferIndex; 00157 unsigned int returnSize = 0; 00158 00159 // Case 1: There are already size available bytes in buffer. 00160 if (size <= availableBytes) 00161 { 00162 // Just copy from the buffer, increment the index and return. 00163 memcpy(buffer, myFileBuffer+myBufferIndex, size); 00164 // Increment the buffer index. 00165 myBufferIndex += size; 00166 returnSize = size; 00167 } 00168 // Case 2: There are not size bytes in buffer. 
00169 else 00170 { 00171 // Check to see if there are some bytes in the buffer. 00172 if (availableBytes > 0) 00173 { 00174 // Size > availableBytes > 0 00175 // Copy the available bytes into the buffer. 00176 memcpy(buffer, myFileBuffer+myBufferIndex, availableBytes); 00177 } 00178 unsigned int remainingSize = size - availableBytes; 00179 00180 // Check if the remaining size is more or less than the 00181 // max buffer size. 00182 if(remainingSize < myAllocatedBufferSize) 00183 { 00184 // the remaining size is not the full buffer, but read 00185 // a full buffer worth of data anyway. 00186 myCurrentBufferSize = 00187 readFromFile(myFileBuffer, myAllocatedBufferSize); 00188 00189 // Check to see how much was copied. 00190 unsigned int copySize = remainingSize; 00191 if(copySize > myCurrentBufferSize) 00192 { 00193 copySize = myCurrentBufferSize; 00194 } 00195 00196 // Now copy the rest of the bytes into the buffer. 00197 memcpy((char*)buffer+availableBytes, myFileBuffer, copySize); 00198 00199 // set the buffer index to the location after what we read. 00200 myBufferIndex = copySize; 00201 00202 returnSize = availableBytes + copySize; 00203 } 00204 else 00205 { 00206 // More remaining to be read than the max buffer size, so just 00207 // read directly into the output buffer. 00208 int readSize = readFromFile((char*)buffer + availableBytes, 00209 remainingSize); 00210 returnSize = readSize + availableBytes; 00211 } 00212 } 00213 return(returnSize); 00214 } 00215 00216 00217 /// Get a character from the file. Read a character from the internal 00218 /// buffer, or if the end of the buffer has been reached, read from the 00219 /// file into the buffer and return index 0. 00220 /// \return character that was read or EOF. 00221 inline int ifgetc() 00222 { 00223 if (myBufferIndex >= myCurrentBufferSize) 00224 { 00225 // at the last index, read a new buffer. 
00226 myCurrentBufferSize = readFromFile(myFileBuffer, myAllocatedBufferSize); 00227 myBufferIndex = 0; 00228 } 00229 // If the buffer index is still greater than or equal to the 00230 // myCurrentBufferSize, then we failed to read the file - return EOF. 00231 if (myBufferIndex >= myCurrentBufferSize) 00232 { 00233 return(EOF); 00234 } 00235 return(myFileBuffer[myBufferIndex++]); 00236 } 00237 00238 /// Reset to the beginning of the file. 00239 inline void ifrewind() 00240 { 00241 // Just set the myBufferIndex and the myCurrentBufferSize to 0 to simulate 00242 // clearing the buffer and call rewind to move to the beginning of the 00243 // file. 00244 if (myFileTypePtr == NULL) 00245 { 00246 // No pointer, so nothing to rewind. 00247 return; 00248 } 00249 myCurrentBufferSize = 0; 00250 myBufferIndex = 0; 00251 myFileTypePtr->rewind(); 00252 } 00253 00254 00255 /// Check to see if we have reached the EOF. 00256 /// \return 0 if not EOF, any other value means EOF. 00257 inline int ifeof() 00258 { 00259 // Not EOF if we are not at the end of the buffer. 00260 if (myBufferIndex < myCurrentBufferSize) 00261 { 00262 // There are still available bytes in the buffer, so NOT EOF. 00263 return false; 00264 } 00265 else 00266 { 00267 if (myFileTypePtr == NULL) 00268 { 00269 // No myFileTypePtr, so not eof (return 0). 00270 return 0; 00271 } 00272 // exhausted our buffer, so check the file for eof. 00273 return myFileTypePtr->eof(); 00274 } 00275 } 00276 00277 /// Write the specified buffer into the file. 00278 /// \param buffer buffer containing size bytes to write to the file. 00279 /// \param size number of bytes to write 00280 /// \return number of bytes written 00281 /// We do not buffer the write call, so just leave this as normal. 00282 inline unsigned int ifwrite(const void * buffer, unsigned int size) 00283 { 00284 if (myFileTypePtr == NULL) 00285 { 00286 // No myFileTypePtr, so return 0 - nothing written. 
00287 return 0; 00288 } 00289 return myFileTypePtr->write(buffer, size); 00290 } 00291 00292 /// Returns whether or not the file was successfully opened. 00293 /// \return true if the file is open, false if not. 00294 inline bool isOpen() 00295 { 00296 // It is open if the myFileTypePtr is set and says it is open. 00297 if ((myFileTypePtr != NULL) && myFileTypePtr->isOpen()) 00298 { 00299 return true; 00300 } 00301 // File was not successfully opened. 00302 return false; 00303 } 00304 00305 /// Get current position in the file. 00306 /// \return current position in the file, -1 indicates an error. 00307 inline long int iftell() 00308 { 00309 if (myFileTypePtr == NULL) 00310 { 00311 // No myFileTypePtr, so return false - could not seek. 00312 return -1; 00313 } 00314 return myFileTypePtr->tell(); 00315 } 00316 00317 00318 /// Seek to the specified offset from the origin. 00319 /// \param offset offset into the file to move to (must be from a tell call) 00320 /// \param origin can be any of the following: 00321 /// Note: not all are valid for all filetypes. 00322 /// SEEK_SET - Beginning of file 00323 /// SEEK_CUR - Current position of the file pointer 00324 /// SEEK_END - End of file 00325 /// \return true on successful seek and false on a failed seek. 00326 inline bool ifseek(long int offset, int origin) 00327 { 00328 if (myFileTypePtr == NULL) 00329 { 00330 // No myFileTypePtr, so return false - could not seek. 00331 return false; 00332 } 00333 // Reset buffering since a seek is being done. 00334 myBufferIndex = 0; 00335 myCurrentBufferSize = 0; 00336 return myFileTypePtr->seek(offset, origin); 00337 } 00338 00339 /// Get the filename that is currently opened. 00340 /// \return filename associated with this class 00341 const char* getFileName() const 00342 { 00343 return(myFileName.c_str()); 00344 } 00345 00346 protected: 00347 // Open a file. Called by the constructor. 00348 // Returns true if the file was successfully opened, false otherwise. 
00349 bool openFile(const char * filename, const char * mode, 00350 InputFile::ifileCompression compressionMode); 00351 00352 // Read into a buffer from the file. Since the buffer is passed in and 00353 // this would bypass the myFileBuffer used by this class, this method must 00354 // be protected. 00355 inline int readFromFile(void * buffer, unsigned int size) 00356 { 00357 // If no myFileTypePtr, return 0 - nothing read. 00358 if (myFileTypePtr == NULL) 00359 { 00360 return 0; 00361 } 00362 return myFileTypePtr->read(buffer, size); 00363 } 00364 00365 #ifdef __ZLIB_AVAILABLE__ 00366 // Only necessary with zlib to determine what file type on a new 00367 // file. Without zlib, there are only uncompressed files, so a special 00368 // method is not needed to determine the type of file to open. 00369 // Open a file. This method will open a file with the specified name and 00370 // mode with the fileTypePtr associated with the specified compressionMode. 00371 void openFileUsingMode(const char* filename, const char* mode, 00372 InputFile::ifileCompression compressionMode); 00373 #endif 00374 00375 // The size of the buffer used by this class. 00376 static const unsigned int DEFAULT_BUFFER_SIZE = 1048576; 00377 00378 // Pointer to a class that interfaces with different file types. 00379 FileType* myFileTypePtr; 00380 00381 unsigned int myAllocatedBufferSize; 00382 00383 // Buffer used to do large reads rather than 1 by 1 character reads 00384 // from the file. The class is then managed to iterate through the buffer. 00385 char* myFileBuffer; 00386 00387 // Current index into the buffer. Used to track where we are in reading the 00388 // file from the buffer. 00389 unsigned int myBufferIndex; 00390 00391 // Current number of entries in the buffer. Used to ensure that 00392 // if a read did not fill the buffer, we stop before hitting the 00393 // end of what was read. 
00394 unsigned int myCurrentBufferSize; 00395 00396 std::string myFileName; 00397 }; 00398 00399 00400 /// Define IFILE as a pointer to an InputFile object. 00401 typedef InputFile* IFILE; 00402 00403 00404 /// Open a file. 00405 /// \param filename file to open 00406 /// \param mode same format as fopen: "r" for read & "w" for write. 00407 /// \param compressionMode set the type of file to open for writing or 00408 /// for reading from stdin (when reading files, the compression type is 00409 /// determined by reading the file). 00410 /// \return IFILE - pointer to the InputFile object that has been opened. 00411 inline IFILE ifopen(const char * filename, const char * mode, 00412 InputFile::ifileCompression compressionMode = InputFile::DEFAULT) 00413 { 00414 IFILE file = new InputFile(filename, mode, compressionMode); 00415 if (!file->isOpen()) 00416 { 00417 00418 // Not open, so delete the file, and return null. 00419 delete file; 00420 file = NULL; 00421 } 00422 return file; 00423 } 00424 00425 00426 /// Close the file. 00427 /// \param file file to be closed - IFILE is a pointer to an InputFile object 00428 /// \return status of the close (0 is success). 00429 inline int ifclose(IFILE file) 00430 { 00431 int result = file->ifclose(); 00432 delete file; 00433 file = NULL; 00434 return(result); 00435 } 00436 00437 /// Read size bytes from the file into the buffer. 00438 /// \param file file to be read - IFILE is a pointer to an InputFile object 00439 /// \param buffer pointer to memory at least size bytes big to write the 00440 /// data into. 00441 /// \param size number of bytes to be read 00442 /// \return number of bytes read 00443 inline unsigned int ifread(IFILE file, void * buffer, unsigned int size) 00444 { 00445 return(file->ifread(buffer, size)); 00446 } 00447 00448 /// Get a character from the file. 
Read a character from the internal 00449 /// buffer, or if the end of the buffer has been reached, read from the 00450 /// file into the buffer and return index 0. 00451 /// \param file file to be read - IFILE is a pointer to an InputFile object 00452 /// \return character that was read or EOF. 00453 inline int ifgetc(IFILE file) 00454 { 00455 return(file->ifgetc()); 00456 } 00457 00458 /// Reset to the beginning of the file. 00459 /// \param file file to be rewound - IFILE is a pointer to an InputFile object 00460 inline void ifrewind(IFILE file) 00461 { 00462 file->ifrewind(); 00463 } 00464 00465 /// Check to see if we have reached the EOF. 00466 /// \param file file to be checked - IFILE is a pointer to an InputFile object 00467 /// \return 0 if not EOF, any other value means EOF. 00468 inline int ifeof(IFILE file) 00469 { 00470 return(file->ifeof()); 00471 } 00472 00473 /// Write the specified buffer into the file. 00474 /// \param file file to write to - IFILE is a pointer to an InputFile object 00475 /// \param buffer buffer containing size bytes to write to the file. 00476 /// \param size number of bytes to write 00477 /// \return number of bytes written 00478 inline unsigned int ifwrite(IFILE file, const void * buffer, unsigned int size) 00479 { 00480 return(file->ifwrite(buffer, size)); 00481 } 00482 00483 /// Get current position in the file. 00484 /// \param file file to perform tell on - IFILE is a pointer to an InputFile object 00485 /// \return current position in the file, -1 indicates an error. 00486 inline long int iftell(IFILE file) 00487 { 00488 return (file->iftell()); 00489 } 00490 00491 /// Seek to the specified offset from the origin. 00492 /// \param file file to perform seek on - IFILE is a pointer to an InputFile object 00493 /// \param offset offset into the file to move to (must be from a tell call) 00494 /// \param origin can be any of the following: 00495 /// Note: not all are valid for all filetypes. 
00496 /// SEEK_SET - Beginning of file 00497 /// SEEK_CUR - Current position of the file pointer 00498 /// SEEK_END - End of file 00499 /// \return true on successful seek and false on a failed seek. 00500 inline bool ifseek(IFILE file, long int offset, int origin) 00501 { 00502 return (file->ifseek(offset, origin)); 00503 } 00504 00505 /// Write to a file using fprintf format. 00506 /// \param file file to write to - IFILE is a pointer to an InputFile object 00507 /// \param format printf format for writing, followed by parameters. 00508 /// \return number of bytes written 00509 int ifprintf(IFILE output, char * format, ...); 00510 00511 /// Read a line from a file using streaming. 00512 /// \param stream file to read from - IFILE is a pointer to an InputFile object 00513 /// \param str output string containing the line read from the file. 00514 inline IFILE operator >> (IFILE stream, std::string &str) 00515 { 00516 str.clear(); 00517 int ch; 00518 // not safe... newline handling? 00519 while ((ch = stream->ifgetc())!=EOF && (ch != '\n')) str.push_back(ch); 00520 return stream; 00521 } 00522 00523 #endif 00524
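// 1.6.3  (version stamp left over from the generated source listing - presumably the documentation tool's version; not part of the header)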