InputFile.h

Go to the documentation of this file.
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 /*! \file */ 
00018 #ifndef __INPUTFILE_H__
00019 #define __INPUTFILE_H__
00020 
00021 #ifdef  __gnu_linux__
00022 #ifndef __ZLIB_AVAILABLE__
00023 #define __ZLIB_AVAILABLE__
00024 #endif
00025 #endif
00026 
00027 #include <stdio.h>
00028 #include <iostream>
00029 #include <cstring>
00030 
00031 #include "FileType.h"
00032 
00033 /// Class for easily reading/writing files without having to worry about
00034 /// file type (uncompressed, gzip, bgzf) when reading.
00035 class InputFile
00036 {
00037 public:
00038 
00039     /// Compression to use when writing a file & decompression used when
00040     /// reading a file from stdin.  Any other read checks the file to determine
00041     ///  how to uncompress it.
00042     enum ifileCompression {
00043         DEFAULT,  ///< Check the extension, if it is ".gz", treat as gzip, otherwise treat it as UNCOMPRESSED.
00044         UNCOMPRESSED,  ///< uncompressed file.
00045         GZIP,  ///< gzip file.
00046         BGZF ///< bgzf file.
00047     };
00048 
00049     /// Default constructor
00050     InputFile()
00051     {
00052         myFileTypePtr = NULL;
00053         myBufferIndex = 0;
00054         myCurrentBufferSize = 0;
00055         // Default to buffer.
00056         myAllocatedBufferSize = DEFAULT_BUFFER_SIZE;
00057         myFileBuffer = new char[myAllocatedBufferSize];
00058         myFileName.clear();
00059     }
00060 
00061     /// Destructor
00062     ~InputFile();
00063 
00064     /// Constructor for opening a file.
00065     /// \param filename file to open
00066     /// \param mode same format as fopen: "r" for read & "w" for write.
00067     /// \param compressionMode set the type of file to open for writing or
00068     /// for reading from stdin (when reading files, the compression type is 
00069     /// determined by reading the file).
00070     InputFile(const char * filename, const char * mode,
00071               InputFile::ifileCompression compressionMode = InputFile::DEFAULT);
00072 
00073     /// Set the buffer size for reading from files so that bufferSize bytes
00074     /// are read at a time and stored until accessed by another read call.
00075     /// This improves performance over reading the file small bits at a time.
00076     /// Buffering reads disables the tell call for bgzf files.
00077     /// Any previous values in the buffer will be deleted.
00078     /// \param bufferSize number of bytes to read/buffer at a time,
00079     /// default buffer size is 1048576, and turn off read buffering by setting
00080     /// bufferSize = 1;
00081     inline void bufferReads(unsigned int bufferSize = DEFAULT_BUFFER_SIZE)
00082     {
00083         // If the buffer size is the same, do nothing.
00084         if(bufferSize == myAllocatedBufferSize)
00085         {
00086             return;
00087         }
00088         // Delete the previous buffer.
00089         if(myFileBuffer != NULL)
00090         {
00091             delete[] myFileBuffer;
00092         }
00093         myBufferIndex = 0;
00094         myCurrentBufferSize = 0;
00095         // The buffer size must be at least 1 so one character can be
00096         // read and ifgetc can just assume reading into the buffer.
00097         if(bufferSize < 1)
00098         {
00099             bufferSize = 1;
00100         }
00101         myFileBuffer = new char[bufferSize];
00102         myAllocatedBufferSize = bufferSize;
00103 
00104         if(myFileTypePtr != NULL)
00105         {
00106             if(bufferSize == 1)
00107             {
00108                 myFileTypePtr->setBuffered(false);
00109             }
00110             else
00111             {
00112                 myFileTypePtr->setBuffered(true);
00113             }
00114         }
00115     }
00116 
00117 
00118     /// Disable read buffering.
00119     inline void disableBuffering()
00120     {
00121         bufferReads(1);
00122         if(myFileTypePtr != NULL)
00123         {
00124             myFileTypePtr->setBuffered(false);
00125         }
00126     }
00127 
00128     
00129     /// Close the file.
00130     /// \return status of the close (0 is success).
00131     inline int ifclose()
00132     {
00133         if (myFileTypePtr == NULL)
00134         {
00135             return EOF;
00136         }
00137         int result = myFileTypePtr->close();
00138         delete myFileTypePtr;
00139         myFileTypePtr = NULL;
00140         myFileName.clear();
00141         return result;
00142     }
00143 
00144     /// Read size bytes from the file into the buffer.
00145     /// \param buffer pointer to memory at least size bytes big to write the
00146     /// data into.
00147     /// \param size number of bytes to be read
00148     /// \return number of bytes read
00149     inline int ifread(void * buffer, unsigned int size)
00150     {
00151         // There are 2 cases:
00152         //  1) There are already size available bytes in buffer.
00153         //  2) There are not size bytes in buffer.
00154 
00155         // Determine the number of available bytes in the buffer.
00156         unsigned int availableBytes = myCurrentBufferSize - myBufferIndex;
00157         unsigned int returnSize = 0;
00158 
00159         // Case 1: There are already size available bytes in buffer.
00160         if (size <= availableBytes)
00161         {
00162             //   Just copy from the buffer, increment the index and return.
00163             memcpy(buffer, myFileBuffer+myBufferIndex, size);
00164             // Increment the buffer index.
00165             myBufferIndex += size;
00166             returnSize = size;
00167         }
00168         // Case 2: There are not size bytes in buffer.
00169         else
00170         {
00171             // Check to see if there are some bytes in the buffer.
00172             if (availableBytes > 0)
00173             {
00174                 // Size > availableBytes > 0
00175                 // Copy the available bytes into the buffer.
00176                 memcpy(buffer, myFileBuffer+myBufferIndex, availableBytes);
00177             }
00178             unsigned int remainingSize = size - availableBytes;
00179 
00180             // Check if the remaining size is more or less than the
00181             // max buffer size.
00182             if(remainingSize < myAllocatedBufferSize)
00183             {
00184                 // the remaining size is not the full buffer, but read
00185                 //  a full buffer worth of data anyway.
00186                 myCurrentBufferSize =
00187                     readFromFile(myFileBuffer, myAllocatedBufferSize);
00188                 
00189                 // Check to see how much was copied.
00190                 unsigned int copySize = remainingSize;
00191                 if(copySize > myCurrentBufferSize)
00192                 {
00193                     copySize = myCurrentBufferSize;
00194                 }
00195 
00196                 // Now copy the rest of the bytes into the buffer.
00197                 memcpy((char*)buffer+availableBytes, myFileBuffer, copySize);
00198 
00199                 // set the buffer index to the location after what we read.
00200                 myBufferIndex = copySize;
00201                 
00202                 returnSize = availableBytes + copySize;
00203             }
00204             else
00205             {
00206                 // More remaining to be read than the max buffer size, so just
00207                 // read directly into the output buffer.
00208                 int readSize = readFromFile((char*)buffer + availableBytes,
00209                                             remainingSize);
00210                 returnSize = readSize + availableBytes;
00211             }
00212         }
00213         return(returnSize);
00214     }
00215 
00216 
00217     /// Get a character from the file.  Read a character from the internal
00218     /// buffer, or if the end of the buffer has been reached, read from the
00219     /// file into the buffer and return index 0.
00220     /// \return character that was read or EOF.
00221     inline int ifgetc()
00222     {
00223         if (myBufferIndex >= myCurrentBufferSize)
00224         {
00225             // at the last index, read a new buffer.
00226             myCurrentBufferSize = readFromFile(myFileBuffer, myAllocatedBufferSize);
00227             myBufferIndex = 0;
00228         }
00229         // If the buffer index is still greater than or equal to the
00230         // myCurrentBufferSize, then we failed to read the file - return EOF.
00231         if (myBufferIndex >= myCurrentBufferSize)
00232         {
00233             return(EOF);
00234         }
00235         return(myFileBuffer[myBufferIndex++]);
00236     }
00237 
00238     /// Reset to the beginning of the file.
00239     inline void ifrewind()
00240     {
00241         // Just set the myBufferIndex and the myCurrentBufferSize to 0 to simulate
00242         // clearing the buffer and call rewind to move to the beginning of the
00243         // file.
00244         if (myFileTypePtr == NULL)
00245         {
00246             // No pointer, so nothing to rewind.
00247             return;
00248         }
00249         myCurrentBufferSize = 0;
00250         myBufferIndex = 0;
00251         myFileTypePtr->rewind();
00252     }
00253 
00254 
00255     /// Check to see if we have reached the EOF.
00256     /// \return 0 if not EOF, any other value means EOF.
00257     inline int ifeof()
00258     {
00259         // Not EOF if we are not at the end of the buffer.
00260         if (myBufferIndex < myCurrentBufferSize)
00261         {
00262             // There are still available bytes in the buffer, so NOT EOF.
00263             return false;
00264         }
00265         else
00266         {
00267             if (myFileTypePtr == NULL)
00268             {
00269                 // No myFileTypePtr, so not eof (return 0).
00270                 return 0;
00271             }
00272             // exhausted our buffer, so check the file for eof.
00273             return myFileTypePtr->eof();
00274         }
00275     }
00276 
00277     /// Write the specified buffer into the file.
00278     /// \param buffer buffer containing size bytes to write to the file.
00279     /// \param size number of bytes to write
00280     /// \return number of bytes written
00281     /// We do not buffer the write call, so just leave this as normal.
00282     inline unsigned int ifwrite(const void * buffer, unsigned int size)
00283     {
00284         if (myFileTypePtr == NULL)
00285         {
00286             // No myFileTypePtr, so return 0 - nothing written.
00287             return 0;
00288         }
00289         return myFileTypePtr->write(buffer, size);
00290     }
00291 
00292     /// Returns whether or not the file was successfully opened.
00293     /// \return true if the file is open, false if not.
00294     inline bool isOpen()
00295     {
00296         // It is open if the myFileTypePtr is set and says it is open.
00297         if ((myFileTypePtr != NULL) && myFileTypePtr->isOpen())
00298         {
00299             return true;
00300         }
00301         // File was not successfully opened.
00302         return false;
00303     }
00304 
00305     /// Get current position in the file.
00306     /// \return current position in the file, -1 indicates an error.
00307     inline long int iftell()
00308     {
00309         if (myFileTypePtr == NULL)
00310         {
00311             // No myFileTypePtr, so return false - could not seek.
00312             return -1;
00313         }
00314         return myFileTypePtr->tell();
00315     }
00316 
00317 
00318     /// Seek to the specified offset from the origin.
00319     /// \param offset offset into the file to move to (must be from a tell call)
00320     /// \param origin can be any of the following:
00321     /// Note: not all are valid for all filetypes.
00322     ///   SEEK_SET - Beginning of file
00323     ///   SEEK_CUR - Current position of the file pointer
00324     ///   SEEK_END - End of file
00325     /// \return true on successful seek and false on a failed seek.
00326     inline bool ifseek(long int offset, int origin)
00327     {
00328         if (myFileTypePtr == NULL)
00329         {
00330             // No myFileTypePtr, so return false - could not seek.
00331             return false;
00332         }
00333         // Reset buffering since a seek is being done.
00334         myBufferIndex = 0;
00335         myCurrentBufferSize = 0;
00336         return myFileTypePtr->seek(offset, origin);
00337     }
00338 
00339     /// Get the filename that is currently opened.
00340     /// \return filename associated with this class
00341     const char* getFileName() const
00342     {
00343         return(myFileName.c_str());
00344     }
00345 
00346 protected:
00347     // Open a file. Called by the constructor.
00348     // Returns true if the file was successfully opened, false otherwise.
00349     bool openFile(const char * filename, const char * mode,
00350                   InputFile::ifileCompression compressionMode);
00351 
00352     // Read into a buffer from the file.  Since the buffer is passed in and
00353     // this would bypass the myFileBuffer used by this class, this method must
00354     // be protected.
00355     inline int readFromFile(void * buffer, unsigned int size)
00356     {
00357         // If no myFileTypePtr, return 0 - nothing read.
00358         if (myFileTypePtr == NULL)
00359         {
00360             return 0;
00361         }
00362         return myFileTypePtr->read(buffer, size);
00363     }
00364 
00365 #ifdef __ZLIB_AVAILABLE__
00366     // Only necessary with zlib to determine what file type on a new
00367     // file.  Without zlib, there are only uncompressed files, so a special
00368     // method is not needed to determine the type of file to open.
00369     // Open a file.  This method will open a file with the specified name and
00370     // mode with the fileTypePtr associated with the specified compressionMode.
00371     void openFileUsingMode(const char* filename, const char* mode,
00372                            InputFile::ifileCompression compressionMode);
00373 #endif
00374 
00375     // The size of the buffer used by this class.
00376     static const unsigned int DEFAULT_BUFFER_SIZE = 1048576;
00377 
00378     // Pointer to a class that interfaces with different file types.
00379     FileType* myFileTypePtr;
00380 
00381     unsigned int myAllocatedBufferSize;
00382 
00383     // Buffer used to do large reads rather than 1 by 1 character reads
00384     // from the file.  The class is then managed to iterate through the buffer.
00385     char* myFileBuffer;
00386 
00387     // Current index into the buffer.  Used to track where we are in reading the
00388     // file from the buffer.
00389     unsigned int myBufferIndex;
00390 
00391     // Current number of entries in the buffer.  Used to ensure that
00392     // if a read did not fill the buffer, we stop before hitting the
00393     // end of what was read.
00394     unsigned int myCurrentBufferSize;
00395 
00396     std::string myFileName;
00397 };
00398 
00399 
00400 /// Define IFILE as a pointer to an InputFile object; it is the handle type returned by ifopen and taken by the other if* free functions.
00401 typedef InputFile* IFILE;
00402 
00403 
00404 /// Open a file.
00405 /// \param filename file to open
00406 /// \param mode same format as fopen: "r" for read & "w" for write.
00407 /// \param compressionMode set the type of file to open for writing or
00408 /// for reading from stdin (when reading files, the compression type is 
00409 /// determined by reading the file).
00410 /// \return IFILE - pointer to the InputFile object that has been opened.
00411 inline IFILE ifopen(const char * filename, const char * mode,
00412                     InputFile::ifileCompression compressionMode = InputFile::DEFAULT)
00413 {
00414     IFILE file = new InputFile(filename, mode, compressionMode);
00415     if (!file->isOpen())
00416     {
00417 
00418         // Not open, so delete the file, and return null.
00419         delete file;
00420         file = NULL;
00421     }
00422     return file;
00423 }
00424 
00425 
00426 /// Close the file.
00427 /// \param file file to be closed - IFILE is a pointer to an InputFile object
00428 /// \return status of the close (0 is success).
00429 inline int ifclose(IFILE file)
00430 {
00431     int result = file->ifclose();
00432     delete file;
00433     file = NULL;
00434     return(result);
00435 }
00436 
00437 /// Read size bytes from the file into the buffer.
00438 /// \param file file to be read - IFILE is a pointer to an InputFile object
00439 /// \param buffer pointer to memory at least size bytes big to write the
00440 /// data into.
00441 /// \param size number of bytes to be read
00442 /// \return number of bytes read
00443 inline unsigned int ifread(IFILE file, void * buffer, unsigned int size)
00444 {
00445     return(file->ifread(buffer, size));
00446 }
00447 
00448 /// Get a character from the file.  Read a character from the internal
00449 /// buffer, or if the end of the buffer has been reached, read from the
00450 /// file into the buffer and return index 0.
00451 /// \param file file to be read - IFILE is a pointer to an InputFile object
00452 /// \return character that was read or EOF.
00453 inline int ifgetc(IFILE file)
00454 {
00455     return(file->ifgetc());
00456 }
00457 
00458 /// Reset to the beginning of the file.
00459 /// \param file file to be rewound - IFILE is a pointer to an InputFile object
00460 inline void ifrewind(IFILE file)
00461 {
00462     file->ifrewind();
00463 }
00464 
00465 /// Check to see if we have reached the EOF.
00466 /// \param file file to be checked - IFILE is a pointer to an InputFile object
00467 /// \return 0 if not EOF, any other value means EOF.
00468 inline int ifeof(IFILE file)
00469 {
00470     return(file->ifeof());
00471 }
00472 
00473 /// Write the specified buffer into the file.
00474 /// \param file file to write to - IFILE is a pointer to an InputFile object
00475 /// \param buffer buffer containing size bytes to write to the file.
00476 /// \param size number of bytes to write
00477 /// \return number of bytes written
00478 inline unsigned int ifwrite(IFILE file, const void * buffer, unsigned int size)
00479 {
00480     return(file->ifwrite(buffer, size));
00481 }
00482 
00483 /// Get current position in the file.
00484 /// \param file file to perform tell on - IFILE is a pointer to an InputFile object
00485 /// \return current position in the file, -1 indicates an error.
00486 inline long int iftell(IFILE file)
00487 {
00488     return (file->iftell());
00489 }
00490 
00491 /// Seek to the specified offset from the origin.
00492 /// \param file file to perform seek on - IFILE is a pointer to an InputFile object
00493 /// \param offset offset into the file to move to (must be from a tell call)
00494 /// \param origin can be any of the following:
00495 /// Note: not all are valid for all filetypes.
00496 ///   SEEK_SET - Beginning of file
00497 ///   SEEK_CUR - Current position of the file pointer
00498 ///   SEEK_END - End of file
00499 /// \return true on successful seek and false on a failed seek.
00500 inline bool ifseek(IFILE file, long int offset, int origin)
00501 {
00502     return (file->ifseek(offset, origin));
00503 }
00504 
00505 /// Write to a file using fprintf format.
00506 /// \param output file to write to - IFILE is a pointer to an InputFile object
00507 /// \param format printf format for writing, followed by parameters.
00508 /// \return number of bytes written
00509 int ifprintf(IFILE output, char * format, ...);
00510 
00511 /// Read a line from a file using streaming.
00512 /// \param stream file to read from - IFILE is a pointer to an InputFile object
00513 /// \param str output string containing the line read from the file.
00514 inline IFILE operator >> (IFILE stream, std::string &str)
00515 {
00516     str.clear();
00517     int ch;
00518     // not safe... newline handling?
00519     while ((ch = stream->ifgetc())!=EOF && (ch != '\n')) str.push_back(ch);
00520     return stream;
00521 }
00522 
00523 #endif
00524 
Generated on Tue Mar 22 22:50:18 2011 for StatGen Software by  doxygen 1.6.3