InputFile.h

Go to the documentation of this file.
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 /*! \file */ 
00018 #ifndef __INPUTFILE_H__
00019 #define __INPUTFILE_H__
00020 
00021 #ifdef  __gnu_linux__
00022 #ifndef __ZLIB_AVAILABLE__
00023 #define __ZLIB_AVAILABLE__
00024 #endif
00025 #endif
00026 
00027 #include <stdio.h>
00028 #include <iostream>
00029 #include <cstring>
00030 #include <stdint.h>
00031 
00032 #include "FileType.h"
00033 
00034 /// Class for easily reading/writing files without having to worry about
00035 /// file type (uncompressed, gzip, bgzf) when reading.
00036 /// It hides the low level file operations/structure from the user, allowing
00037 /// them to generically open and operate on a file using the same
00038 /// interface without knowing the file format (standard uncompressed,
00039 /// gzip, or bgzf).  For writing, the user must specify the file type.
00040 /// There is a typedef IFILE which is InputFile* and setup to mimic FILE
00041 /// including global methods that take IFILE as a parameter.
00042 class InputFile
00043 {
00044     bool    myAttemptRecovery;  // use recovery techniques if possible
00045 public:
00046 
00047     /// Compression to use when writing a file & decompression used when
00048     /// reading a file from stdin.  Any other read checks the file to determine
00049     ///  how to uncompress it.
00050     enum ifileCompression {
00051         DEFAULT,  ///< Check the extension, if it is ".gz", treat as gzip, otherwise treat it as UNCOMPRESSED.
00052         UNCOMPRESSED,  ///< uncompressed file.
00053         GZIP,  ///< gzip file.
00054         BGZF ///< bgzf file.
00055     };
00056 
00057     /// Default constructor
00058     InputFile()
00059     {
00060         myAttemptRecovery = false;
00061         myFileTypePtr = NULL;
00062         myBufferIndex = 0;
00063         myCurrentBufferSize = 0;
00064         // Default to buffer.
00065         myAllocatedBufferSize = DEFAULT_BUFFER_SIZE;
00066         myFileBuffer = new char[myAllocatedBufferSize];
00067         myFileName.clear();
00068     }
00069 
00070     /// Destructor
00071     ~InputFile();
00072 
00073     /// Constructor for opening a file.
00074     /// \param filename file to open
00075     /// \param mode same format as fopen: "r" for read & "w" for write.
00076     /// \param compressionMode set the type of file to open for writing or
00077     /// for reading from stdin (when reading files, the compression type is 
00078     /// determined by reading the file).
00079     InputFile(const char * filename, const char * mode,
00080               InputFile::ifileCompression compressionMode = InputFile::DEFAULT);
00081 
00082     /// Set the buffer size for reading from files so that bufferSize bytes
00083     /// are read at a time and stored until accessed by another read call.
00084     /// This improves performance over reading the file small bits at a time.
00085     /// Buffering reads disables the tell call for bgzf files.
00086     /// Any previous values in the buffer will be deleted.
00087     /// \param bufferSize number of bytes to read/buffer at a time,
00088     /// default buffer size is 1048576, and turn off read buffering by setting
00089     /// bufferSize = 1;
00090     inline void bufferReads(unsigned int bufferSize = DEFAULT_BUFFER_SIZE)
00091     {
00092         // If the buffer size is the same, do nothing.
00093         if(bufferSize == myAllocatedBufferSize)
00094         {
00095             return;
00096         }
00097         // Delete the previous buffer.
00098         if(myFileBuffer != NULL)
00099         {
00100             delete[] myFileBuffer;
00101         }
00102         myBufferIndex = 0;
00103         myCurrentBufferSize = 0;
00104         // The buffer size must be at least 1 so one character can be
00105         // read and ifgetc can just assume reading into the buffer.
00106         if(bufferSize < 1)
00107         {
00108             bufferSize = 1;
00109         }
00110         myFileBuffer = new char[bufferSize];
00111         myAllocatedBufferSize = bufferSize;
00112 
00113         if(myFileTypePtr != NULL)
00114         {
00115             if(bufferSize == 1)
00116             {
00117                 myFileTypePtr->setBuffered(false);
00118             }
00119             else
00120             {
00121                 myFileTypePtr->setBuffered(true);
00122             }
00123         }
00124     }
00125 
00126 
00127     /// Disable read buffering.
00128     inline void disableBuffering()
00129     {
00130         bufferReads(1);
00131         if(myFileTypePtr != NULL)
00132         {
00133             myFileTypePtr->setBuffered(false);
00134         }
00135     }
00136 
00137     
00138     /// Close the file.
00139     /// \return status of the close (0 is success).
00140     inline int ifclose()
00141     {
00142         if (myFileTypePtr == NULL)
00143         {
00144             return EOF;
00145         }
00146         int result = myFileTypePtr->close();
00147         delete myFileTypePtr;
00148         myFileTypePtr = NULL;
00149         myFileName.clear();
00150         return result;
00151     }
00152 
00153     /// Read size bytes from the file into the buffer.
00154     /// \param buffer pointer to memory at least size bytes big to write the
00155     /// data into.
00156     /// \param size number of bytes to be read
00157     /// \return number of bytes read, if it is not equal to size,
00158     /// there was either an error or the end of the file was reached, use
00159     /// ifeof to determine which case it was.
00160     inline int ifread(void * buffer, unsigned int size)
00161     {
00162         // There are 2 cases:
00163         //  1) There are already size available bytes in buffer.
00164         //  2) There are not size bytes in buffer.
00165 
00166         // Determine the number of available bytes in the buffer.
00167         unsigned int availableBytes = myCurrentBufferSize - myBufferIndex;
00168         int returnSize = 0;
00169 
00170         // Case 1: There are already size available bytes in buffer.
00171         if (size <= availableBytes)
00172         {
00173             //   Just copy from the buffer, increment the index and return.
00174             memcpy(buffer, myFileBuffer+myBufferIndex, size);
00175             // Increment the buffer index.
00176             myBufferIndex += size;
00177             returnSize = size;
00178         }
00179         // Case 2: There are not size bytes in buffer.
00180         else
00181         {
00182             // Check to see if there are some bytes in the buffer.
00183             if (availableBytes > 0)
00184             {
00185                 // Size > availableBytes > 0
00186                 // Copy the available bytes into the buffer.
00187                 memcpy(buffer, myFileBuffer+myBufferIndex, availableBytes);
00188             }
00189             // So far availableBytes have been copied into the read buffer.
00190             returnSize = availableBytes;
00191             // Increment myBufferIndex  by what was read.
00192             myBufferIndex += availableBytes;
00193 
00194             unsigned int remainingSize = size - availableBytes;
00195 
00196             // Check if the remaining size is more or less than the
00197             // max buffer size.
00198             if(remainingSize < myAllocatedBufferSize)
00199             {
00200                 // the remaining size is not the full buffer, but read
00201                 //  a full buffer worth of data anyway.
00202                 myCurrentBufferSize =
00203                     readFromFile(myFileBuffer, myAllocatedBufferSize);
00204 
00205                 // Check for an error.
00206                 if(myCurrentBufferSize <= 0)
00207                 {
00208                     // No more data was successfully read, so check to see
00209                     // if any data was copied to the return buffer at all.
00210                     if( returnSize == 0)
00211                     {
00212                         // No data has been copied at all into the
00213                         // return read buffer, so just return the value
00214                         // returned from readFromFile.
00215                         returnSize = myCurrentBufferSize;
00216                         // Otherwise, returnSize is already set to the
00217                         // available bytes that was already copied (so no
00218                         // else statement is needed).
00219                     }
00220                     // Set myBufferIndex & myCurrentBufferSize to 0.
00221                     myCurrentBufferSize = 0;
00222                     myBufferIndex = 0;
00223                 }
00224                 else
00225                 {
00226                     // Successfully read more data.
00227                     // Check to see how much was copied.
00228                     int copySize = remainingSize;
00229                     if(copySize > myCurrentBufferSize)
00230                     {
00231                         // Not the entire requested amount was read
00232                         // (either from EOF or there was a partial read due to
00233                         // an error), so set the copySize to what was read.
00234                         copySize = myCurrentBufferSize;
00235                     }
00236 
00237                     // Now copy the rest of the bytes into the buffer.
00238                     memcpy((char*)buffer+availableBytes, 
00239                            myFileBuffer, copySize);
00240 
00241                     // set the buffer index to the location after what we are
00242                     // returning as read.
00243                     myBufferIndex = copySize;
00244                 
00245                     returnSize += copySize;
00246                 }
00247             }
00248             else
00249             {
00250                 // More remaining to be read than the max buffer size, so just
00251                 // read directly into the output buffer.
00252                 int readSize = readFromFile((char*)buffer + availableBytes,
00253                                             remainingSize);
00254 
00255                 // Already used the buffer, so "clear" it.
00256                 myCurrentBufferSize = 0;
00257                 myBufferIndex = 0;
00258                 if(readSize <= 0)
00259                 {
00260                     // No more data was successfully read, so check to see
00261                     // if any data was copied to the return buffer at all.
00262                     if(returnSize == 0)
00263                     {
00264                         // No data has been copied at all into the
00265                         // return read buffer, so just return the value
00266                         // returned from readFromFile.
00267                         returnSize = readSize;
00268                         // Otherwise, returnSize is already set to the
00269                         // available bytes that was already copied (so no
00270                         // else statement is needed).
00271                     }
00272                 }
00273                 else
00274                 {
00275                     // More data was read, so increment the return count.
00276                     returnSize += readSize;
00277                 }
00278             }
00279         }
00280         return(returnSize);
00281     }
00282 
00283 
00284     /// Get a character from the file.  Read a character from the internal
00285     /// buffer, or if the end of the buffer has been reached, read from the
00286     /// file into the buffer and return index 0.
00287     /// \return character that was read or EOF.
00288     inline int ifgetc()
00289     {
00290         if (myBufferIndex >= myCurrentBufferSize)
00291         {
00292             // at the last index, read a new buffer.
00293             myCurrentBufferSize = readFromFile(myFileBuffer, myAllocatedBufferSize);
00294             myBufferIndex = 0;
00295         }
00296         // If the buffer index is still greater than or equal to the
00297         // myCurrentBufferSize, then we failed to read the file - return EOF.
00298         if (myBufferIndex >= myCurrentBufferSize)
00299         {
00300             return(EOF);
00301         }
00302         return(myFileBuffer[myBufferIndex++]);
00303     }
00304 
00305     /// Reset to the beginning of the file.
00306     inline void ifrewind()
00307     {
00308         // Just set the myBufferIndex and the myCurrentBufferSize to 0 to simulate
00309         // clearing the buffer and call rewind to move to the beginning of the
00310         // file.
00311         if (myFileTypePtr == NULL)
00312         {
00313             // No pointer, so nothing to rewind.
00314             return;
00315         }
00316         myCurrentBufferSize = 0;
00317         myBufferIndex = 0;
00318         myFileTypePtr->rewind();
00319     }
00320 
00321 
00322     /// Check to see if we have reached the EOF.
00323     /// \return 0 if not EOF, any other value means EOF.
00324     inline int ifeof()
00325     {
00326         // Not EOF if we are not at the end of the buffer.
00327         if (myBufferIndex < myCurrentBufferSize)
00328         {
00329             // There are still available bytes in the buffer, so NOT EOF.
00330             return false;
00331         }
00332         else
00333         {
00334             if (myFileTypePtr == NULL)
00335             {
00336                 // No myFileTypePtr, so not eof (return 0).
00337                 return 0;
00338             }
00339             // exhausted our buffer, so check the file for eof.
00340             return myFileTypePtr->eof();
00341         }
00342     }
00343 
00344     /// Write the specified buffer into the file.
00345     /// \param buffer buffer containing size bytes to write to the file.
00346     /// \param size number of bytes to write
00347     /// \return number of bytes written
00348     /// We do not buffer the write call, so just leave this as normal.
00349     inline unsigned int ifwrite(const void * buffer, unsigned int size)
00350     {
00351         if (myFileTypePtr == NULL)
00352         {
00353             // No myFileTypePtr, so return 0 - nothing written.
00354             return 0;
00355         }
00356         return myFileTypePtr->write(buffer, size);
00357     }
00358 
00359     /// Returns whether or not the file was successfully opened.
00360     /// \return true if the file is open, false if not.
00361     inline bool isOpen()
00362     {
00363         // It is open if the myFileTypePtr is set and says it is open.
00364         if ((myFileTypePtr != NULL) && myFileTypePtr->isOpen())
00365         {
00366             return true;
00367         }
00368         // File was not successfully opened.
00369         return false;
00370     }
00371 
00372     /// Get current position in the file.
00373     /// \return current position in the file, -1 indicates an error.
00374     inline int64_t iftell()
00375     {
00376         if (myFileTypePtr == NULL)
00377         {
00378             // No myFileTypePtr, so return false - could not seek.
00379             return -1;
00380         }
00381         int64_t pos = myFileTypePtr->tell();
00382         pos -= (myCurrentBufferSize - myBufferIndex);
00383         return(pos);
00384     }
00385 
00386 
00387     /// Seek to the specified offset from the origin.
00388     /// \param offset offset into the file to move to (must be from a tell call)
00389     /// \param origin can be any of the following:
00390     /// Note: not all are valid for all filetypes.
00391     ///   SEEK_SET - Beginning of file
00392     ///   SEEK_CUR - Current position of the file pointer
00393     ///   SEEK_END - End of file
00394     /// \return true on successful seek and false on a failed seek.
00395     inline bool ifseek(int64_t offset, int origin)
00396     {
00397         if (myFileTypePtr == NULL)
00398         {
00399             // No myFileTypePtr, so return false - could not seek.
00400             return false;
00401         }
00402         // TODO - may be able to seek within the buffer if applicable.
00403         // Reset buffering since a seek is being done.
00404         myBufferIndex = 0;
00405         myCurrentBufferSize = 0;
00406         return myFileTypePtr->seek(offset, origin);
00407     }
00408 
00409     /// Get the filename that is currently opened.
00410     /// \return filename associated with this class
00411     const char* getFileName() const
00412     {
00413         return(myFileName.c_str());
00414     }
00415 
00416     /// Enable (default) or disable recovery.
00417     /// 
00418     /// When true, we can attach a myFileTypePtr
00419     /// that implements a recovery capable decompressor.
00420     /// This requires that the caller be able to catch
00421     /// the exception XXX "blah blah blah".
00422     ///
00423     void setAttemptRecovery(bool flag = false)
00424     {
00425         myAttemptRecovery = flag;
00426     }
00427 
00428     bool attemptRecoverySync(bool (*checkSignature)(void *data) , int length)
00429     {
00430         if(myFileTypePtr==NULL) return false; 
00431         return myFileTypePtr->attemptRecoverySync(checkSignature, length);
00432     }
00433 
00434     // Open a file. Called by the constructor.
00435     // Returns true if the file was successfully opened, false otherwise.
00436     bool openFile(const char * filename, const char * mode,
00437                   InputFile::ifileCompression compressionMode);
00438 
00439 protected:
00440     // Read into a buffer from the file.  Since the buffer is passed in and
00441     // this would bypass the myFileBuffer used by this class, this method must
00442     // be protected.
00443     inline int readFromFile(void * buffer, unsigned int size)
00444     {
00445         // If no myFileTypePtr, return 0 - nothing read.
00446         if (myFileTypePtr == NULL)
00447         {
00448             return 0;
00449         }
00450         return myFileTypePtr->read(buffer, size);
00451     }
00452 
00453 #ifdef __ZLIB_AVAILABLE__
00454     // Only necessary with zlib to determine what file type on a new
00455     // file.  Without zlib, there are only uncompressed files, so a special
00456     // method is not needed to determine the type of file to open.
00457     // Open a file.  This method will open a file with the specified name and
00458     // mode with the fileTypePtr associated with the specified compressionMode.
00459     void openFileUsingMode(const char* filename, const char* mode,
00460                            InputFile::ifileCompression compressionMode);
00461 #endif
00462 
00463     // The size of the buffer used by this class.
00464     static const unsigned int DEFAULT_BUFFER_SIZE = 1048576;
00465 
00466     // Pointer to a class that interfaces with different file types.
00467     FileType* myFileTypePtr;
00468 
00469     unsigned int myAllocatedBufferSize;
00470 
00471     // Buffer used to do large reads rather than 1 by 1 character reads
00472     // from the file.  The class is then managed to iterate through the buffer.
00473     char* myFileBuffer;
00474 
00475     // Current index into the buffer.  Used to track where we are in reading the
00476     // file from the buffer.
00477     int myBufferIndex;
00478 
00479     // Current number of entries in the buffer.  Used to ensure that
00480     // if a read did not fill the buffer, we stop before hitting the
00481     // end of what was read.
00482     int myCurrentBufferSize;
00483 
00484     std::string myFileName;
00485 };
00486 
00487 
00488 /// Define IFILE as a pointer to an InputFile object.
00489 typedef InputFile* IFILE;
00490 
00491 
00492 /// Open a file with the specified name and mode, using a filename of "-" to 
00493 /// indicate stdin/stdout.
00494 /// \param filename file to open ("-" meands stdin/stdout)
00495 /// \param mode same format as fopen: "r" for read & "w" for write.
00496 /// \param compressionMode set the type of file to open for writing or
00497 /// for reading from stdin (when reading files not from stdin, the compression
00498 /// type is determined by reading the file).
00499 /// \return IFILE - pointer to the InputFile object that has been opened.
00500 inline IFILE ifopen(const char * filename, const char * mode,
00501                     InputFile::ifileCompression compressionMode = InputFile::DEFAULT)
00502 {
00503     IFILE file = new InputFile(filename, mode, compressionMode);
00504     if (!file->isOpen())
00505     {
00506 
00507         // Not open, so delete the file, and return null.
00508         delete file;
00509         file = NULL;
00510     }
00511     return file;
00512 }
00513 
00514 
00515 /// Close the file.
00516 /// \param file file to be closed - IFILE is a pointer to an InputFile object
00517 /// \return status of the close (0 is success or if NULL is passed in).
00518 inline int ifclose(IFILE file)
00519 {
00520     if(file == NULL)
00521     {
00522         // NULL Pointer passed in, so return 0, since no file is open, so
00523         // does not need to be closed.
00524         return(0);
00525     }
00526     int result = file->ifclose();
00527     delete file;
00528     file = NULL;
00529     return(result);
00530 }
00531 
00532 /// Read up to size bytes from the file into the buffer.
00533 /// \param file file to be read - IFILE is a pointer to an InputFile object
00534 /// \param buffer pointer to memory at least size bytes big to write the
00535 /// data into.
00536 /// \param size number of bytes to be read
00537 /// \return number of bytes read
00538 inline unsigned int ifread(IFILE file, void * buffer, unsigned int size)
00539 {
00540     if(file == NULL)
00541     {
00542         // No file was passed in, so 0 bytes were read.
00543         return(0);
00544     }
00545     return(file->ifread(buffer, size));
00546 }
00547 
00548 /// Get a character from the file.  Read a character from the internal
00549 /// buffer, or if the end of the buffer has been reached, read from the
00550 /// file into the buffer and return index 0.
00551 /// \param file file to be read - IFILE is a pointer to an InputFile object
00552 /// \return character that was read or EOF.
00553 inline int ifgetc(IFILE file)
00554 {
00555     if(file == NULL)
00556     {
00557         // return eof since there is no file.
00558         return(EOF);
00559     }
00560     return(file->ifgetc());
00561 }
00562 
00563 /// Reset to the beginning of the file (cannot be done for stdin/stdout).
00564 /// \param file file to be rewound - IFILE is a pointer to an InputFile object
00565 inline void ifrewind(IFILE file)
00566 {
00567     if(file == NULL)
00568     {
00569         return;
00570     }
00571     file->ifrewind();
00572 }
00573 
00574 /// Check to see if we have reached the EOF (returns 0 if not EOF).
00575 /// \param file file to be checked - IFILE is a pointer to an InputFile object
00576 /// \return 0 if not EOF, any other value means EOF.
00577 inline int ifeof(IFILE file)
00578 {
00579     if(file == NULL)
00580     {
00581         // No file, so that is considered to be EOF, so return 1.
00582         return(1);
00583     }
00584     return(file->ifeof());
00585 }
00586 
00587 /// Write the specified number of bytes from the specified buffer into the file.
00588 /// \param file file to write to - IFILE is a pointer to an InputFile object
00589 /// \param buffer buffer containing size bytes to write to the file.
00590 /// \param size number of bytes to write
00591 /// \return number of bytes written
00592 inline unsigned int ifwrite(IFILE file, const void * buffer, unsigned int size)
00593 {
00594     if(file == NULL)
00595     {
00596         // No file specified, so retun 0 bytes written.
00597         return(0);
00598     }
00599     return(file->ifwrite(buffer, size));
00600 }
00601 
00602 /// Get current position in the file.  Can be fed back into ifseek.
00603 /// \param file file to perform tell on - IFILE is a pointer to an InputFile object
00604 /// \return current position in the file, -1 indicates an error.
00605 inline int64_t iftell(IFILE file)
00606 {
00607     if(file == NULL)
00608     {
00609         return(-1);
00610     }
00611     return (file->iftell());
00612 }
00613 
00614 /// Seek to the specified position (result from an iftell), but cannot
00615 /// be done for stdin/stdout.
00616 /// \param file file to perform seek on - IFILE is a pointer to an InputFile object
00617 /// \param offset offset into the file to move to (must be from a tell call)
00618 /// \param origin can be any of the following:
00619 /// Note: not all are valid for all filetypes.
00620 ///   SEEK_SET - Beginning of file
00621 ///   SEEK_CUR - Current position of the file pointer
00622 ///   SEEK_END - End of file
00623 /// \return true on successful seek and false on a failed seek.
00624 inline bool ifseek(IFILE file, int64_t offset, int origin)
00625 {
00626     if(file == NULL)
00627     {
00628         // Could not see since no file was specified.
00629         return(false);
00630     }
00631     return (file->ifseek(offset, origin));
00632 }
00633 
00634 /// Write to a file using fprintf format.
00635 /// \param output file to write to - IFILE is a pointer to an InputFile object
00636 /// \param format printf format for writing, followed by parameters.
00637 /// \return number of bytes written
00638 int ifprintf(IFILE output, const char * format, ...);
00639 
00640 /// Read a line from a file using streaming.
00641 /// \param stream file to read from - IFILE is a pointer to an InputFile object
00642 /// \param str output string containing the line read from the file.
00643 inline IFILE operator >> (IFILE stream, std::string &str)
00644 {
00645     str.clear();
00646     int ch;
00647     // not safe... newline handling?
00648     while ((ch = stream->ifgetc())!=EOF && (ch != '\n')) str.push_back(ch);
00649     return stream;
00650 }
00651 
00652 #endif
00653 
Generated on Tue Sep 6 17:52:00 2011 for libStatGen Software by  doxygen 1.6.3