libStatGen Software  1
InputFile.cpp
00001 /*
00002  *  Copyright (C) 2010-2012  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include "InputFile.h"
00019 #include "StringBasics.h"
00020 #include "GzipHeader.h"
00021 #include "BgzfFileType.h"
00022 #include "BgzfFileTypeRecovery.h"
00023 #include "GzipFileType.h"
00024 #include "UncompressedFileType.h"
00025 
00026 #include <stdarg.h>
00027 
00028 InputFile::InputFile(const char * filename, const char * mode,
00029                      InputFile::ifileCompression compressionMode)
00030 {
00031     // XXX duplicate code
00032     myAttemptRecovery = false;
00033     myFileTypePtr = NULL;
00034     myBufferIndex = 0;
00035     myCurrentBufferSize = 0;
00036     myAllocatedBufferSize = DEFAULT_BUFFER_SIZE;
00037     myFileBuffer = new char[myAllocatedBufferSize];
00038     myFileName.clear();
00039 
00040     openFile(filename, mode, compressionMode);
00041 }
00042 
00043 
00044 int InputFile::readTilChar(const std::string& stopChars, std::string& stringRef)
00045 {
00046     int charRead = 0;
00047     size_t pos = std::string::npos;
00048     // Loop until the character was not found in the stop characters.
00049     while(pos == std::string::npos)
00050     {
00051         charRead = ifgetc();
00052 
00053         // First Check for EOF.  If EOF is found, just return -1
00054         if(charRead == EOF)
00055         {
00056             return(-1);
00057         }
00058         
00059         // Try to find the character in the stopChars.
00060         pos = stopChars.find(charRead);
00061 
00062         if(pos == std::string::npos)
00063         {
00064             // Didn't find a stop character and it is not an EOF, 
00065             // so add it to the string.
00066             stringRef += charRead;
00067         }
00068     }
00069     return(pos);
00070 }
00071 
00072 
00073 int InputFile::readTilChar(const std::string& stopChars)
00074 {
00075     int charRead = 0;
00076     size_t pos = std::string::npos;
00077     // Loop until the character was not found in the stop characters.
00078     while(pos == std::string::npos)
00079     {
00080         charRead = ifgetc();
00081 
00082         // First Check for EOF.  If EOF is found, just return -1
00083         if(charRead == EOF)
00084         {
00085             return(-1);
00086         }
00087         
00088         // Try to find the character in the stopChars.
00089         pos = stopChars.find(charRead);
00090     }
00091     return(pos);
00092 }
00093 
00094 
00095 int InputFile::discardLine()
00096 {
00097     int charRead = 0;
00098     // Loop until the character was not found in the stop characters.
00099     while((charRead != EOF) && (charRead != '\n'))
00100     {
00101         charRead = ifgetc();
00102     }
00103     // First Check for EOF.  If EOF is found, just return -1
00104     if(charRead == EOF)
00105     {
00106         return(-1);
00107     }
00108     return(0);
00109 }
00110 
00111 
00112 int InputFile::readLine(std::string& line)
00113 {
00114     int charRead = 0;
00115     while(!ifeof())
00116     {
00117         charRead = ifgetc();
00118         if(charRead == EOF)
00119         {
00120             return(-1);
00121         }
00122         if(charRead == '\n')
00123         {
00124             return(0);
00125         }
00126         line += charRead;
00127     }
00128     // Should never get here.
00129     return(-1);
00130 }
00131 
00132 
00133 int InputFile::readTilTab(std::string& field)
00134 {
00135     int charRead = 0;
00136     while(!ifeof())
00137     {
00138         charRead = ifgetc();
00139         if(charRead == EOF)
00140         {
00141             return(-1);
00142         }
00143         if(charRead == '\n')
00144         {
00145             return(0);
00146         }
00147         if(charRead == '\t')
00148         {
00149             return(1);
00150         }
00151         field += charRead;
00152     }
00153     return(-1);
00154 }
00155 
00156 
00157 #ifdef __ZLIB_AVAILABLE__
00158 
00159 // Open a file. Called by the constructor.
00160 // Returns true if the file was successfully opened, false otherwise.
00161 bool InputFile::openFile(const char * filename, const char * mode,
00162                          InputFile::ifileCompression compressionMode)
00163 {
00164     //
00165     // if recovering, we don't want to issue big readaheads, since
00166     // that interferes with the decompression - we only want to 
00167     // decompress one at a time, and handle the exceptions immediately
00168     // rather than at some indeterminate point in time.
00169     //
00170     if(myAttemptRecovery) {
00171         bufferReads(1);
00172     }
00173     // If a file is for write, just open a new file.
00174     if (mode[0] == 'w' || mode[0] == 'W')
00175     {
00176         openFileUsingMode(filename, mode, compressionMode);
00177     }
00178     else
00179     {
00180         // Check if reading from stdin.
00181         if((strcmp(filename, "-") == 0) || (strcmp(filename, "-.gz") == 0))
00182         {
00183             // Reading from stdin, open it based on the 
00184             // compression mode.
00185             openFileUsingMode(filename, mode, compressionMode);
00186         }
00187         else
00188         {
00189             // Not from stdin, so determine the file type.
00190 
00191             // Open the file read only to determine file type.
00192             UncompressedFileType file(filename, "r");
00193             // If the file could not be opened, either create a new one or
00194             // return failure.
00195             if (!file.isOpen())
00196             {
00197                 // If the mode is for read, then the file must exist, otherwise,
00198                 // create a new file.
00199                 if (mode[0] == 'r' || mode[0] == 'R')
00200                 {
00201                     // File must exist.
00202                     if (myFileTypePtr != NULL)
00203                     {
00204                         delete myFileTypePtr;
00205                         myFileTypePtr = NULL;
00206                     }
00207                     // Return false, was not opened.
00208                     return false;
00209                 }
00210                 else
00211                 {
00212                     openFileUsingMode(filename, mode, compressionMode);
00213                 }
00214             }
00215             else
00216             {
00217                 // File was successfully opened, so try to determine the
00218                 // filetype from the file.
00219                 // Read the file to see if it a gzip file.
00220                 GzipHeader gzipHeader;
00221                 bool isGzip = gzipHeader.readHeader(file);
00222                 
00223                 // The file header has been read, so close the file, so it can
00224                 // be re-opened as the correct type.
00225                 file.close();
00226 
00227                 if (isGzip)
00228                 {
00229                     // This file is a gzip file.
00230                     // Check to see if it is BGZF Compression.
00231                     if (gzipHeader.isBgzfFile())
00232                     {
00233                         // This file has BGZF Compression, so set the file
00234                         // pointer.
00235                         if(myAttemptRecovery) {
00236                             // NB: this reader will throw std::runtime_error when it recovers
00237                             myFileTypePtr = new BgzfFileTypeRecovery(filename, mode);
00238                         } else {
00239                             // use the standard bgzf reader (samtools)
00240                             myFileTypePtr = new BgzfFileType(filename, mode);
00241                         }
00242                     }
00243                     else
00244                     {
00245                         // Not BGZF, just a normal gzip.
00246                         myFileTypePtr = new GzipFileType(filename, mode);
00247                    }
00248                 }
00249                 else
00250                 {
00251                     // The file is a uncompressed, uncompressed file,
00252                     // so set the myFileTypePtr accordingly.
00253                     myFileTypePtr = new UncompressedFileType(filename, mode);
00254                 }
00255             }
00256         }
00257     }
00258     if(myFileTypePtr == NULL)
00259     {
00260         return(false);
00261     }
00262     if (!myFileTypePtr->isOpen())
00263     {
00264         // The file was not opened, so delete the pointer and set to null.
00265         delete myFileTypePtr;
00266         myFileTypePtr = NULL;
00267         return false;
00268     }
00269 
00270     if(myAllocatedBufferSize == 1)
00271     {
00272         myFileTypePtr->setBuffered(false);
00273     }
00274     else
00275     {
00276         myFileTypePtr->setBuffered(true);
00277     }
00278     myFileName = filename;
00279     return true;
00280 }
00281 
00282 
00283 // Open a file.  This method will open a file with the specified name and
00284 // mode with the fileTypePtr associated with the specified compressionMode.
00285 void InputFile::openFileUsingMode(const char * filename, const char * mode,
00286                                   ifileCompression compressionMode)
00287 {
00288     switch (compressionMode)
00289     {
00290         case GZIP:
00291             // Gzipped.
00292             myFileTypePtr = new GzipFileType(filename, mode);
00293             break;
00294         case BGZF:
00295             //
00296             // BGZF compression - recovery is possible, so use
00297             // Bgzf recovery reader if asked.
00298             //
00299             if(myAttemptRecovery && ((mode[0] == 'r') || (mode[0] == 'R')))
00300             {
00301                 // NB: this reader will throw std::runtime_error when it recovers
00302                 myFileTypePtr = new BgzfFileTypeRecovery(filename, mode);
00303             }
00304             else
00305             {
00306                 myFileTypePtr = new BgzfFileType(filename, mode);
00307             }
00308             break;
00309         case UNCOMPRESSED:
00310             myFileTypePtr = new UncompressedFileType(filename, mode);
00311             break;
00312         case InputFile::DEFAULT:
00313         default:
00314             // Check the extension. If it is ".gz", treat as gzip.
00315             // otherwise treat it as UNCOMPRESSED.
00316             int lastchar = 0;
00317             while (filename[lastchar] != 0) lastchar++;
00318             if ((lastchar >= 3 &&
00319                     filename[lastchar - 3] == '.' &&
00320                     filename[lastchar - 2] == 'g' &&
00321                     filename[lastchar - 1] == 'z'))
00322             {
00323                 // .gz files files should be gzipped.
00324                 myFileTypePtr = new GzipFileType(filename, mode);
00325             }
00326             else
00327             {
00328                 // Create an uncompressed file.
00329                 myFileTypePtr = new UncompressedFileType(filename, mode);
00330             }
00331             break;
00332     }
00333 
00334     if(myFileTypePtr == NULL)
00335     {
00336         return;
00337     }
00338     if(myAllocatedBufferSize == 1)
00339     {
00340         myFileTypePtr->setBuffered(false);
00341     }
00342     else
00343     {
00344         myFileTypePtr->setBuffered(true);
00345     }
00346 }
00347 
00348 #else
00349 
00350 // No zlib, so just treat all files as std files.
00351 // Open a file. Called by the constructor.
00352 // Returns true if the file was successfully opened, false otherwise.
00353 bool InputFile::openFile(const char * filename, const char * mode,
00354                          InputFile::ifileCompression compressionMode)
00355 {
00356     //  No zlib, so it is a uncompressed, uncompressed file.
00357     myFileTypePtr = new UncompressedFileType(filename, mode);
00358 
00359     if(myFileTypePtr == NULL)
00360     {
00361         return(false);
00362     }
00363     if (!myFileTypePtr->isOpen())
00364     {
00365         // The file was not opened, so delete the pointer and set to null.
00366         delete myFileTypePtr;
00367         myFileTypePtr = NULL;
00368         return false;
00369     }
00370     if(myAllocatedBufferSize == 1)
00371     {
00372         myFileTypePtr->setBuffered(false);
00373     }
00374     else
00375     {
00376         myFileTypePtr->setBuffered(true);
00377     }
00378     myFileName = filename;
00379     return true;
00380 }
00381 
00382 #endif
00383 
00384 
00385 InputFile::~InputFile()
00386 {
00387     delete myFileTypePtr;
00388     myFileTypePtr = NULL;
00389 
00390     if(myFileBuffer != NULL)
00391     {
00392         delete[] myFileBuffer;
00393         myFileBuffer = NULL;
00394     }
00395 }
00396 
00397 
00398 int ifprintf(IFILE output, const char * format, ...)
00399 {
00400     String buffer;
00401 
00402     va_list  ap;
00403     va_start(ap, format);
00404 
00405     buffer.vprintf(format, ap);
00406 
00407     va_end(ap);
00408 
00409     return ::ifwrite(output, (const char *) buffer, buffer.Length());
00410 }
00411 
00412 
00413 InputFile& operator << (InputFile& stream, double num)
00414 {
00415     String val;
00416     val = num;
00417     stream << val;
00418     return(stream);
00419 }
00420 
00421 
00422 InputFile& operator << (InputFile& stream, int num)
00423 {
00424     String val;
00425     val = num;
00426     stream << val;
00427     return(stream);
00428 }
00429 
00430 
00431 InputFile& operator << (InputFile& stream, unsigned int num)
00432 {
00433     String val;
00434     val = num;
00435     stream << val;
00436     return(stream);
00437 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends