libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010-2012 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #include "InputFile.h" 00019 #include "StringBasics.h" 00020 #include "GzipHeader.h" 00021 #include "BgzfFileType.h" 00022 #include "BgzfFileTypeRecovery.h" 00023 #include "GzipFileType.h" 00024 #include "UncompressedFileType.h" 00025 00026 #include <stdarg.h> 00027 00028 InputFile::InputFile(const char * filename, const char * mode, 00029 InputFile::ifileCompression compressionMode) 00030 { 00031 // XXX duplicate code 00032 myAttemptRecovery = false; 00033 myFileTypePtr = NULL; 00034 myBufferIndex = 0; 00035 myCurrentBufferSize = 0; 00036 myAllocatedBufferSize = DEFAULT_BUFFER_SIZE; 00037 myFileBuffer = new char[myAllocatedBufferSize]; 00038 myFileName.clear(); 00039 00040 openFile(filename, mode, compressionMode); 00041 } 00042 00043 00044 int InputFile::readTilChar(const std::string& stopChars, std::string& stringRef) 00045 { 00046 int charRead = 0; 00047 size_t pos = std::string::npos; 00048 // Loop until the character was not found in the stop characters. 00049 while(pos == std::string::npos) 00050 { 00051 charRead = ifgetc(); 00052 00053 // First Check for EOF. If EOF is found, just return -1 00054 if(charRead == EOF) 00055 { 00056 return(-1); 00057 } 00058 00059 // Try to find the character in the stopChars. 00060 pos = stopChars.find(charRead); 00061 00062 if(pos == std::string::npos) 00063 { 00064 // Didn't find a stop character and it is not an EOF, 00065 // so add it to the string. 00066 stringRef += charRead; 00067 } 00068 } 00069 return(pos); 00070 } 00071 00072 00073 int InputFile::readTilChar(const std::string& stopChars) 00074 { 00075 int charRead = 0; 00076 size_t pos = std::string::npos; 00077 // Loop until the character was not found in the stop characters. 00078 while(pos == std::string::npos) 00079 { 00080 charRead = ifgetc(); 00081 00082 // First Check for EOF. If EOF is found, just return -1 00083 if(charRead == EOF) 00084 { 00085 return(-1); 00086 } 00087 00088 // Try to find the character in the stopChars. 00089 pos = stopChars.find(charRead); 00090 } 00091 return(pos); 00092 } 00093 00094 00095 int InputFile::discardLine() 00096 { 00097 int charRead = 0; 00098 // Loop until the character was not found in the stop characters. 00099 while((charRead != EOF) && (charRead != '\n')) 00100 { 00101 charRead = ifgetc(); 00102 } 00103 // First Check for EOF. If EOF is found, just return -1 00104 if(charRead == EOF) 00105 { 00106 return(-1); 00107 } 00108 return(0); 00109 } 00110 00111 00112 int InputFile::readLine(std::string& line) 00113 { 00114 int charRead = 0; 00115 while(!ifeof()) 00116 { 00117 charRead = ifgetc(); 00118 if(charRead == EOF) 00119 { 00120 return(-1); 00121 } 00122 if(charRead == '\n') 00123 { 00124 return(0); 00125 } 00126 line += charRead; 00127 } 00128 // Should never get here. 00129 return(-1); 00130 } 00131 00132 00133 int InputFile::readTilTab(std::string& field) 00134 { 00135 int charRead = 0; 00136 while(!ifeof()) 00137 { 00138 charRead = ifgetc(); 00139 if(charRead == EOF) 00140 { 00141 return(-1); 00142 } 00143 if(charRead == '\n') 00144 { 00145 return(0); 00146 } 00147 if(charRead == '\t') 00148 { 00149 return(1); 00150 } 00151 field += charRead; 00152 } 00153 return(-1); 00154 } 00155 00156 00157 #ifdef __ZLIB_AVAILABLE__ 00158 00159 // Open a file. Called by the constructor. 00160 // Returns true if the file was successfully opened, false otherwise. 00161 bool InputFile::openFile(const char * filename, const char * mode, 00162 InputFile::ifileCompression compressionMode) 00163 { 00164 // 00165 // if recovering, we don't want to issue big readaheads, since 00166 // that interferes with the decompression - we only want to 00167 // decompress one at a time, and handle the exceptions immediately 00168 // rather than at some indeterminate point in time. 00169 // 00170 if(myAttemptRecovery) { 00171 bufferReads(1); 00172 } 00173 // If a file is for write, just open a new file. 00174 if (mode[0] == 'w' || mode[0] == 'W') 00175 { 00176 openFileUsingMode(filename, mode, compressionMode); 00177 } 00178 else 00179 { 00180 // Check if reading from stdin. 00181 if((strcmp(filename, "-") == 0) || (strcmp(filename, "-.gz") == 0)) 00182 { 00183 // Reading from stdin, open it based on the 00184 // compression mode. 00185 openFileUsingMode(filename, mode, compressionMode); 00186 } 00187 else 00188 { 00189 // Not from stdin, so determine the file type. 00190 00191 // Open the file read only to determine file type. 00192 UncompressedFileType file(filename, "r"); 00193 // If the file could not be opened, either create a new one or 00194 // return failure. 00195 if (!file.isOpen()) 00196 { 00197 // If the mode is for read, then the file must exist, otherwise, 00198 // create a new file. 00199 if (mode[0] == 'r' || mode[0] == 'R') 00200 { 00201 // File must exist. 00202 if (myFileTypePtr != NULL) 00203 { 00204 delete myFileTypePtr; 00205 myFileTypePtr = NULL; 00206 } 00207 // Return false, was not opened. 00208 return false; 00209 } 00210 else 00211 { 00212 openFileUsingMode(filename, mode, compressionMode); 00213 } 00214 } 00215 else 00216 { 00217 // File was successfully opened, so try to determine the 00218 // filetype from the file. 00219 // Read the file to see if it a gzip file. 00220 GzipHeader gzipHeader; 00221 bool isGzip = gzipHeader.readHeader(file); 00222 00223 // The file header has been read, so close the file, so it can 00224 // be re-opened as the correct type. 00225 file.close(); 00226 00227 if (isGzip) 00228 { 00229 // This file is a gzip file. 00230 // Check to see if it is BGZF Compression. 00231 if (gzipHeader.isBgzfFile()) 00232 { 00233 // This file has BGZF Compression, so set the file 00234 // pointer. 00235 if(myAttemptRecovery) { 00236 // NB: this reader will throw std::runtime_error when it recovers 00237 myFileTypePtr = new BgzfFileTypeRecovery(filename, mode); 00238 } else { 00239 // use the standard bgzf reader (samtools) 00240 myFileTypePtr = new BgzfFileType(filename, mode); 00241 } 00242 } 00243 else 00244 { 00245 // Not BGZF, just a normal gzip. 00246 myFileTypePtr = new GzipFileType(filename, mode); 00247 } 00248 } 00249 else 00250 { 00251 // The file is a uncompressed, uncompressed file, 00252 // so set the myFileTypePtr accordingly. 00253 myFileTypePtr = new UncompressedFileType(filename, mode); 00254 } 00255 } 00256 } 00257 } 00258 if(myFileTypePtr == NULL) 00259 { 00260 return(false); 00261 } 00262 if (!myFileTypePtr->isOpen()) 00263 { 00264 // The file was not opened, so delete the pointer and set to null. 00265 delete myFileTypePtr; 00266 myFileTypePtr = NULL; 00267 return false; 00268 } 00269 00270 if(myAllocatedBufferSize == 1) 00271 { 00272 myFileTypePtr->setBuffered(false); 00273 } 00274 else 00275 { 00276 myFileTypePtr->setBuffered(true); 00277 } 00278 myFileName = filename; 00279 return true; 00280 } 00281 00282 00283 // Open a file. This method will open a file with the specified name and 00284 // mode with the fileTypePtr associated with the specified compressionMode. 00285 void InputFile::openFileUsingMode(const char * filename, const char * mode, 00286 ifileCompression compressionMode) 00287 { 00288 switch (compressionMode) 00289 { 00290 case GZIP: 00291 // Gzipped. 00292 myFileTypePtr = new GzipFileType(filename, mode); 00293 break; 00294 case BGZF: 00295 // 00296 // BGZF compression - recovery is possible, so use 00297 // Bgzf recovery reader if asked. 00298 // 00299 if(myAttemptRecovery && ((mode[0] == 'r') || (mode[0] == 'R'))) 00300 { 00301 // NB: this reader will throw std::runtime_error when it recovers 00302 myFileTypePtr = new BgzfFileTypeRecovery(filename, mode); 00303 } 00304 else 00305 { 00306 myFileTypePtr = new BgzfFileType(filename, mode); 00307 } 00308 break; 00309 case UNCOMPRESSED: 00310 myFileTypePtr = new UncompressedFileType(filename, mode); 00311 break; 00312 case InputFile::DEFAULT: 00313 default: 00314 // Check the extension. If it is ".gz", treat as gzip. 00315 // otherwise treat it as UNCOMPRESSED. 00316 int lastchar = 0; 00317 while (filename[lastchar] != 0) lastchar++; 00318 if ((lastchar >= 3 && 00319 filename[lastchar - 3] == '.' && 00320 filename[lastchar - 2] == 'g' && 00321 filename[lastchar - 1] == 'z')) 00322 { 00323 // .gz files files should be gzipped. 00324 myFileTypePtr = new GzipFileType(filename, mode); 00325 } 00326 else 00327 { 00328 // Create an uncompressed file. 00329 myFileTypePtr = new UncompressedFileType(filename, mode); 00330 } 00331 break; 00332 } 00333 00334 if(myFileTypePtr == NULL) 00335 { 00336 return; 00337 } 00338 if(myAllocatedBufferSize == 1) 00339 { 00340 myFileTypePtr->setBuffered(false); 00341 } 00342 else 00343 { 00344 myFileTypePtr->setBuffered(true); 00345 } 00346 } 00347 00348 #else 00349 00350 // No zlib, so just treat all files as std files. 00351 // Open a file. Called by the constructor. 00352 // Returns true if the file was successfully opened, false otherwise. 00353 bool InputFile::openFile(const char * filename, const char * mode, 00354 InputFile::ifileCompression compressionMode) 00355 { 00356 // No zlib, so it is a uncompressed, uncompressed file. 00357 myFileTypePtr = new UncompressedFileType(filename, mode); 00358 00359 if(myFileTypePtr == NULL) 00360 { 00361 return(false); 00362 } 00363 if (!myFileTypePtr->isOpen()) 00364 { 00365 // The file was not opened, so delete the pointer and set to null. 00366 delete myFileTypePtr; 00367 myFileTypePtr = NULL; 00368 return false; 00369 } 00370 if(myAllocatedBufferSize == 1) 00371 { 00372 myFileTypePtr->setBuffered(false); 00373 } 00374 else 00375 { 00376 myFileTypePtr->setBuffered(true); 00377 } 00378 myFileName = filename; 00379 return true; 00380 } 00381 00382 #endif 00383 00384 00385 InputFile::~InputFile() 00386 { 00387 delete myFileTypePtr; 00388 myFileTypePtr = NULL; 00389 00390 if(myFileBuffer != NULL) 00391 { 00392 delete[] myFileBuffer; 00393 myFileBuffer = NULL; 00394 } 00395 } 00396 00397 00398 int ifprintf(IFILE output, const char * format, ...) 00399 { 00400 String buffer; 00401 00402 va_list ap; 00403 va_start(ap, format); 00404 00405 buffer.vprintf(format, ap); 00406 00407 va_end(ap); 00408 00409 return ::ifwrite(output, (const char *) buffer, buffer.Length()); 00410 } 00411 00412 00413 InputFile& operator << (InputFile& stream, double num) 00414 { 00415 String val; 00416 val = num; 00417 stream << val; 00418 return(stream); 00419 } 00420 00421 00422 InputFile& operator << (InputFile& stream, int num) 00423 { 00424 String val; 00425 val = num; 00426 stream << val; 00427 return(stream); 00428 } 00429 00430 00431 InputFile& operator << (InputFile& stream, unsigned int num) 00432 { 00433 String val; 00434 val = num; 00435 stream << val; 00436 return(stream); 00437 }