InputFile.h

Go to the documentation of this file.
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 /*! \file */ 
00018 #ifndef __INPUTFILE_H__
00019 #define __INPUTFILE_H__
00020 
00021 #ifdef  __gnu_linux__
00022 #ifndef __ZLIB_AVAILABLE__
00023 #define __ZLIB_AVAILABLE__
00024 #endif
00025 #endif
00026 
00027 #include <stdio.h>
00028 #include <iostream>
00029 #include <cstring>
00030 #include <stdint.h>
00031 
00032 #include "FileType.h"
00033 
00034 /// Class for easily reading/writing files without having to worry about
00035 /// file type (uncompressed, gzip, bgzf) when reading.
00036 class InputFile
00037 {
00038     bool    myAttemptRecovery;  // use recovery techniques if possible
00039 public:
00040 
00041     /// Compression to use when writing a file & decompression used when
00042     /// reading a file from stdin.  Any other read checks the file to determine
00043     ///  how to uncompress it.
00044     enum ifileCompression {
00045         DEFAULT,  ///< Check the extension, if it is ".gz", treat as gzip, otherwise treat it as UNCOMPRESSED.
00046         UNCOMPRESSED,  ///< uncompressed file.
00047         GZIP,  ///< gzip file.
00048         BGZF ///< bgzf file.
00049     };
00050 
00051     /// Default constructor
00052     InputFile()
00053     {
00054         myAttemptRecovery = false;
00055         myFileTypePtr = NULL;
00056         myBufferIndex = 0;
00057         myCurrentBufferSize = 0;
00058         // Default to buffer.
00059         myAllocatedBufferSize = DEFAULT_BUFFER_SIZE;
00060         myFileBuffer = new char[myAllocatedBufferSize];
00061         myFileName.clear();
00062     }
00063 
00064     /// Destructor
00065     ~InputFile();
00066 
00067     /// Constructor for opening a file.
00068     /// \param filename file to open
00069     /// \param mode same format as fopen: "r" for read & "w" for write.
00070     /// \param compressionMode set the type of file to open for writing or
00071     /// for reading from stdin (when reading files, the compression type is 
00072     /// determined by reading the file).
00073     InputFile(const char * filename, const char * mode,
00074               InputFile::ifileCompression compressionMode = InputFile::DEFAULT);
00075 
00076     /// Set the buffer size for reading from files so that bufferSize bytes
00077     /// are read at a time and stored until accessed by another read call.
00078     /// This improves performance over reading the file small bits at a time.
00079     /// Buffering reads disables the tell call for bgzf files.
00080     /// Any previous values in the buffer will be deleted.
00081     /// \param bufferSize number of bytes to read/buffer at a time,
00082     /// default buffer size is 1048576, and turn off read buffering by setting
00083     /// bufferSize = 1;
00084     inline void bufferReads(unsigned int bufferSize = DEFAULT_BUFFER_SIZE)
00085     {
00086         // If the buffer size is the same, do nothing.
00087         if(bufferSize == myAllocatedBufferSize)
00088         {
00089             return;
00090         }
00091         // Delete the previous buffer.
00092         if(myFileBuffer != NULL)
00093         {
00094             delete[] myFileBuffer;
00095         }
00096         myBufferIndex = 0;
00097         myCurrentBufferSize = 0;
00098         // The buffer size must be at least 1 so one character can be
00099         // read and ifgetc can just assume reading into the buffer.
00100         if(bufferSize < 1)
00101         {
00102             bufferSize = 1;
00103         }
00104         myFileBuffer = new char[bufferSize];
00105         myAllocatedBufferSize = bufferSize;
00106 
00107         if(myFileTypePtr != NULL)
00108         {
00109             if(bufferSize == 1)
00110             {
00111                 myFileTypePtr->setBuffered(false);
00112             }
00113             else
00114             {
00115                 myFileTypePtr->setBuffered(true);
00116             }
00117         }
00118     }
00119 
00120 
00121     /// Disable read buffering.
00122     inline void disableBuffering()
00123     {
00124         bufferReads(1);
00125         if(myFileTypePtr != NULL)
00126         {
00127             myFileTypePtr->setBuffered(false);
00128         }
00129     }
00130 
00131     
00132     /// Close the file.
00133     /// \return status of the close (0 is success).
00134     inline int ifclose()
00135     {
00136         if (myFileTypePtr == NULL)
00137         {
00138             return EOF;
00139         }
00140         int result = myFileTypePtr->close();
00141         delete myFileTypePtr;
00142         myFileTypePtr = NULL;
00143         myFileName.clear();
00144         return result;
00145     }
00146 
00147     /// Read size bytes from the file into the buffer.
00148     /// \param buffer pointer to memory at least size bytes big to write the
00149     /// data into.
00150     /// \param size number of bytes to be read
00151     /// \return number of bytes read, if it is not equal to size,
00152     /// there was either an error or the end of the file was reached, use
00153     /// ifeof to determine which case it was.
00154     inline int ifread(void * buffer, unsigned int size)
00155     {
00156         // There are 2 cases:
00157         //  1) There are already size available bytes in buffer.
00158         //  2) There are not size bytes in buffer.
00159 
00160         // Determine the number of available bytes in the buffer.
00161         unsigned int availableBytes = myCurrentBufferSize - myBufferIndex;
00162         int returnSize = 0;
00163 
00164         // Case 1: There are already size available bytes in buffer.
00165         if (size <= availableBytes)
00166         {
00167             //   Just copy from the buffer, increment the index and return.
00168             memcpy(buffer, myFileBuffer+myBufferIndex, size);
00169             // Increment the buffer index.
00170             myBufferIndex += size;
00171             returnSize = size;
00172         }
00173         // Case 2: There are not size bytes in buffer.
00174         else
00175         {
00176             // Check to see if there are some bytes in the buffer.
00177             if (availableBytes > 0)
00178             {
00179                 // Size > availableBytes > 0
00180                 // Copy the available bytes into the buffer.
00181                 memcpy(buffer, myFileBuffer+myBufferIndex, availableBytes);
00182             }
00183             // So far availableBytes have been copied into the read buffer.
00184             returnSize = availableBytes;
00185             // Increment myBufferIndex  by what was read.
00186             myBufferIndex += availableBytes;
00187 
00188             unsigned int remainingSize = size - availableBytes;
00189 
00190             // Check if the remaining size is more or less than the
00191             // max buffer size.
00192             if(remainingSize < myAllocatedBufferSize)
00193             {
00194                 // the remaining size is not the full buffer, but read
00195                 //  a full buffer worth of data anyway.
00196                 myCurrentBufferSize =
00197                     readFromFile(myFileBuffer, myAllocatedBufferSize);
00198 
00199                 // Check for an error.
00200                 if(myCurrentBufferSize <= 0)
00201                 {
00202                     // No more data was successfully read, so check to see
00203                     // if any data was copied to the return buffer at all.
00204                     if( returnSize == 0)
00205                     {
00206                         // No data has been copied at all into the
00207                         // return read buffer, so just return the value
00208                         // returned from readFromFile.
00209                         returnSize = myCurrentBufferSize;
00210                         // Otherwise, returnSize is already set to the
00211                         // available bytes that was already copied (so no
00212                         // else statement is needed).
00213                     }
00214                     // Set myBufferIndex & myCurrentBufferSize to 0.
00215                     myCurrentBufferSize = 0;
00216                     myBufferIndex = 0;
00217                 }
00218                 else
00219                 {
00220                     // Successfully read more data.
00221                     // Check to see how much was copied.
00222                     int copySize = remainingSize;
00223                     if(copySize > myCurrentBufferSize)
00224                     {
00225                         // Not the entire requested amount was read
00226                         // (either from EOF or there was a partial read due to
00227                         // an error), so set the copySize to what was read.
00228                         copySize = myCurrentBufferSize;
00229                     }
00230 
00231                     // Now copy the rest of the bytes into the buffer.
00232                     memcpy((char*)buffer+availableBytes, 
00233                            myFileBuffer, copySize);
00234 
00235                     // set the buffer index to the location after what we are
00236                     // returning as read.
00237                     myBufferIndex = copySize;
00238                 
00239                     returnSize += copySize;
00240                 }
00241             }
00242             else
00243             {
00244                 // More remaining to be read than the max buffer size, so just
00245                 // read directly into the output buffer.
00246                 int readSize = readFromFile((char*)buffer + availableBytes,
00247                                             remainingSize);
00248 
00249                 // Already used the buffer, so "clear" it.
00250                 myCurrentBufferSize = 0;
00251                 myBufferIndex = 0;
00252                 if(readSize <= 0)
00253                 {
00254                     // No more data was successfully read, so check to see
00255                     // if any data was copied to the return buffer at all.
00256                     if(returnSize == 0)
00257                     {
00258                         // No data has been copied at all into the
00259                         // return read buffer, so just return the value
00260                         // returned from readFromFile.
00261                         returnSize = readSize;
00262                         // Otherwise, returnSize is already set to the
00263                         // available bytes that was already copied (so no
00264                         // else statement is needed).
00265                     }
00266                 }
00267                 else
00268                 {
00269                     // More data was read, so increment the return count.
00270                     returnSize += readSize;
00271                 }
00272             }
00273         }
00274         return(returnSize);
00275     }
00276 
00277 
00278     /// Get a character from the file.  Read a character from the internal
00279     /// buffer, or if the end of the buffer has been reached, read from the
00280     /// file into the buffer and return index 0.
00281     /// \return character that was read or EOF.
00282     inline int ifgetc()
00283     {
00284         if (myBufferIndex >= myCurrentBufferSize)
00285         {
00286             // at the last index, read a new buffer.
00287             myCurrentBufferSize = readFromFile(myFileBuffer, myAllocatedBufferSize);
00288             myBufferIndex = 0;
00289         }
00290         // If the buffer index is still greater than or equal to the
00291         // myCurrentBufferSize, then we failed to read the file - return EOF.
00292         if (myBufferIndex >= myCurrentBufferSize)
00293         {
00294             return(EOF);
00295         }
00296         return(myFileBuffer[myBufferIndex++]);
00297     }
00298 
00299     /// Reset to the beginning of the file.
00300     inline void ifrewind()
00301     {
00302         // Just set the myBufferIndex and the myCurrentBufferSize to 0 to simulate
00303         // clearing the buffer and call rewind to move to the beginning of the
00304         // file.
00305         if (myFileTypePtr == NULL)
00306         {
00307             // No pointer, so nothing to rewind.
00308             return;
00309         }
00310         myCurrentBufferSize = 0;
00311         myBufferIndex = 0;
00312         myFileTypePtr->rewind();
00313     }
00314 
00315 
00316     /// Check to see if we have reached the EOF.
00317     /// \return 0 if not EOF, any other value means EOF.
00318     inline int ifeof()
00319     {
00320         // Not EOF if we are not at the end of the buffer.
00321         if (myBufferIndex < myCurrentBufferSize)
00322         {
00323             // There are still available bytes in the buffer, so NOT EOF.
00324             return false;
00325         }
00326         else
00327         {
00328             if (myFileTypePtr == NULL)
00329             {
00330                 // No myFileTypePtr, so not eof (return 0).
00331                 return 0;
00332             }
00333             // exhausted our buffer, so check the file for eof.
00334             return myFileTypePtr->eof();
00335         }
00336     }
00337 
00338     /// Write the specified buffer into the file.
00339     /// \param buffer buffer containing size bytes to write to the file.
00340     /// \param size number of bytes to write
00341     /// \return number of bytes written
00342     /// We do not buffer the write call, so just leave this as normal.
00343     inline unsigned int ifwrite(const void * buffer, unsigned int size)
00344     {
00345         if (myFileTypePtr == NULL)
00346         {
00347             // No myFileTypePtr, so return 0 - nothing written.
00348             return 0;
00349         }
00350         return myFileTypePtr->write(buffer, size);
00351     }
00352 
00353     /// Returns whether or not the file was successfully opened.
00354     /// \return true if the file is open, false if not.
00355     inline bool isOpen()
00356     {
00357         // It is open if the myFileTypePtr is set and says it is open.
00358         if ((myFileTypePtr != NULL) && myFileTypePtr->isOpen())
00359         {
00360             return true;
00361         }
00362         // File was not successfully opened.
00363         return false;
00364     }
00365 
00366     /// Get current position in the file.
00367     /// \return current position in the file, -1 indicates an error.
00368     inline int64_t iftell()
00369     {
00370         if (myFileTypePtr == NULL)
00371         {
00372             // No myFileTypePtr, so return false - could not seek.
00373             return -1;
00374         }
00375         int64_t pos = myFileTypePtr->tell();
00376         pos -= (myCurrentBufferSize - myBufferIndex);
00377         return(pos);
00378     }
00379 
00380 
00381     /// Seek to the specified offset from the origin.
00382     /// \param offset offset into the file to move to (must be from a tell call)
00383     /// \param origin can be any of the following:
00384     /// Note: not all are valid for all filetypes.
00385     ///   SEEK_SET - Beginning of file
00386     ///   SEEK_CUR - Current position of the file pointer
00387     ///   SEEK_END - End of file
00388     /// \return true on successful seek and false on a failed seek.
00389     inline bool ifseek(int64_t offset, int origin)
00390     {
00391         if (myFileTypePtr == NULL)
00392         {
00393             // No myFileTypePtr, so return false - could not seek.
00394             return false;
00395         }
00396         // TODO - may be able to seek within the buffer if applicable.
00397         // Reset buffering since a seek is being done.
00398         myBufferIndex = 0;
00399         myCurrentBufferSize = 0;
00400         return myFileTypePtr->seek(offset, origin);
00401     }
00402 
00403     /// Get the filename that is currently opened.
00404     /// \return filename associated with this class
00405     const char* getFileName() const
00406     {
00407         return(myFileName.c_str());
00408     }
00409 
00410     /// Enable (default) or disable recovery.
00411     /// 
00412     /// When true, we can attach a myFileTypePtr
00413     /// that implements a recovery capable decompressor.
00414     /// This requires that the caller be able to catch
00415     /// the exception XXX "blah blah blah".
00416     ///
00417     void setAttemptRecovery(bool flag = false)
00418     {
00419         myAttemptRecovery = flag;
00420     }
00421 
00422     bool attemptRecoverySync(bool (*checkSignature)(void *data) , int length)
00423     {
00424         if(myFileTypePtr==NULL) return false; 
00425         return myFileTypePtr->attemptRecoverySync(checkSignature, length);
00426     }
00427 
00428     // Open a file. Called by the constructor.
00429     // Returns true if the file was successfully opened, false otherwise.
00430     bool openFile(const char * filename, const char * mode,
00431                   InputFile::ifileCompression compressionMode);
00432 
00433 protected:
00434     // Read into a buffer from the file.  Since the buffer is passed in and
00435     // this would bypass the myFileBuffer used by this class, this method must
00436     // be protected.
00437     inline int readFromFile(void * buffer, unsigned int size)
00438     {
00439         // If no myFileTypePtr, return 0 - nothing read.
00440         if (myFileTypePtr == NULL)
00441         {
00442             return 0;
00443         }
00444         return myFileTypePtr->read(buffer, size);
00445     }
00446 
00447 #ifdef __ZLIB_AVAILABLE__
00448     // Only necessary with zlib to determine what file type on a new
00449     // file.  Without zlib, there are only uncompressed files, so a special
00450     // method is not needed to determine the type of file to open.
00451     // Open a file.  This method will open a file with the specified name and
00452     // mode with the fileTypePtr associated with the specified compressionMode.
00453     void openFileUsingMode(const char* filename, const char* mode,
00454                            InputFile::ifileCompression compressionMode);
00455 #endif
00456 
00457     // The size of the buffer used by this class.
00458     static const unsigned int DEFAULT_BUFFER_SIZE = 1048576;
00459 
00460     // Pointer to a class that interfaces with different file types.
00461     FileType* myFileTypePtr;
00462 
00463     unsigned int myAllocatedBufferSize;
00464 
00465     // Buffer used to do large reads rather than 1 by 1 character reads
00466     // from the file.  The class is then managed to iterate through the buffer.
00467     char* myFileBuffer;
00468 
00469     // Current index into the buffer.  Used to track where we are in reading the
00470     // file from the buffer.
00471     int myBufferIndex;
00472 
00473     // Current number of entries in the buffer.  Used to ensure that
00474     // if a read did not fill the buffer, we stop before hitting the
00475     // end of what was read.
00476     int myCurrentBufferSize;
00477 
00478     std::string myFileName;
00479 };
00480 
00481 
00482 /// Define IFILE as a pointer to an InputFile object; ifopen returns this type.
00483 typedef InputFile* IFILE;
00484 
00485 
00486 /// Open a file.
00487 /// \param filename file to open
00488 /// \param mode same format as fopen: "r" for read & "w" for write.
00489 /// \param compressionMode set the type of file to open for writing or
00490 /// for reading from stdin (when reading files, the compression type is 
00491 /// determined by reading the file).
00492 /// \return IFILE - pointer to the InputFile object that has been opened.
00493 inline IFILE ifopen(const char * filename, const char * mode,
00494                     InputFile::ifileCompression compressionMode = InputFile::DEFAULT)
00495 {
00496     IFILE file = new InputFile(filename, mode, compressionMode);
00497     if (!file->isOpen())
00498     {
00499 
00500         // Not open, so delete the file, and return null.
00501         delete file;
00502         file = NULL;
00503     }
00504     return file;
00505 }
00506 
00507 
00508 /// Close the file.
00509 /// \param file file to be closed - IFILE is a pointer to an InputFile object
00510 /// \return status of the close (0 is success or if NULL is passed in).
00511 inline int ifclose(IFILE file)
00512 {
00513     if(file == NULL)
00514     {
00515         // NULL Pointer passed in, so return 0, since no file is open, so
00516         // does not need to be closed.
00517         return(0);
00518     }
00519     int result = file->ifclose();
00520     delete file;
00521     file = NULL;
00522     return(result);
00523 }
00524 
00525 /// Read size bytes from the file into the buffer.
00526 /// \param file file to be read - IFILE is a pointer to an InputFile object
00527 /// \param buffer pointer to memory at least size bytes big to write the
00528 /// data into.
00529 /// \param size number of bytes to be read
00530 /// \return number of bytes read
00531 inline unsigned int ifread(IFILE file, void * buffer, unsigned int size)
00532 {
00533     if(file == NULL)
00534     {
00535         // No file was passed in, so 0 bytes were read.
00536         return(0);
00537     }
00538     return(file->ifread(buffer, size));
00539 }
00540 
00541 /// Get a character from the file.  Read a character from the internal
00542 /// buffer, or if the end of the buffer has been reached, read from the
00543 /// file into the buffer and return index 0.
00544 /// \param file file to be read - IFILE is a pointer to an InputFile object
00545 /// \return character that was read or EOF.
00546 inline int ifgetc(IFILE file)
00547 {
00548     if(file == NULL)
00549     {
00550         // return eof since there is no file.
00551         return(EOF);
00552     }
00553     return(file->ifgetc());
00554 }
00555 
00556 /// Reset to the beginning of the file.
00557 /// \param file file to be rewound - IFILE is a pointer to an InputFile object
00558 inline void ifrewind(IFILE file)
00559 {
00560     if(file == NULL)
00561     {
00562         return;
00563     }
00564     file->ifrewind();
00565 }
00566 
00567 /// Check to see if we have reached the EOF.
00568 /// \param file file to be checked - IFILE is a pointer to an InputFile object
00569 /// \return 0 if not EOF, any other value means EOF.
00570 inline int ifeof(IFILE file)
00571 {
00572     if(file == NULL)
00573     {
00574         // No file, so that is considered to be EOF, so return 1.
00575         return(1);
00576     }
00577     return(file->ifeof());
00578 }
00579 
00580 /// Write the specified buffer into the file.
00581 /// \param file file to write to - IFILE is a pointer to an InputFile object
00582 /// \param buffer buffer containing size bytes to write to the file.
00583 /// \param size number of bytes to write
00584 /// \return number of bytes written
00585 inline unsigned int ifwrite(IFILE file, const void * buffer, unsigned int size)
00586 {
00587     if(file == NULL)
00588     {
00589         // No file specified, so retun 0 bytes written.
00590         return(0);
00591     }
00592     return(file->ifwrite(buffer, size));
00593 }
00594 
00595 /// Get current position in the file.
00596 /// \param file file to perform tell on - IFILE is a pointer to an InputFile object
00597 /// \return current position in the file, -1 indicates an error.
00598 inline int64_t iftell(IFILE file)
00599 {
00600     if(file == NULL)
00601     {
00602         return(-1);
00603     }
00604     return (file->iftell());
00605 }
00606 
00607 /// Seek to the specified offset from the origin.
00608 /// \param file file to perform seek on - IFILE is a pointer to an InputFile object
00609 /// \param offset offset into the file to move to (must be from a tell call)
00610 /// \param origin can be any of the following:
00611 /// Note: not all are valid for all filetypes.
00612 ///   SEEK_SET - Beginning of file
00613 ///   SEEK_CUR - Current position of the file pointer
00614 ///   SEEK_END - End of file
00615 /// \return true on successful seek and false on a failed seek.
00616 inline bool ifseek(IFILE file, int64_t offset, int origin)
00617 {
00618     if(file == NULL)
00619     {
00620         // Could not see since no file was specified.
00621         return(false);
00622     }
00623     return (file->ifseek(offset, origin));
00624 }
00625 
00626 /// Write to a file using fprintf format.
00627 /// \param output file to write to - IFILE is a pointer to an InputFile object
00628 /// \param format printf format for writing, followed by parameters.
00629 /// \return number of bytes written
00630 int ifprintf(IFILE output, const char * format, ...);
00631 
00632 /// Read a line from a file using streaming.
00633 /// \param stream file to read from - IFILE is a pointer to an InputFile object
00634 /// \param str output string containing the line read from the file.
00635 inline IFILE operator >> (IFILE stream, std::string &str)
00636 {
00637     str.clear();
00638     int ch;
00639     // not safe... newline handling?
00640     while ((ch = stream->ifgetc())!=EOF && (ch != '\n')) str.push_back(ch);
00641     return stream;
00642 }
00643 
00644 #endif
00645 
Generated on Tue Aug 23 18:19:05 2011 for libStatGen Software by  doxygen 1.6.3