libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 #include <stdexcept> 00018 #include "SamFile.h" 00019 #include "SamFileHeader.h" 00020 #include "SamRecord.h" 00021 #include "BamInterface.h" 00022 #include "SamInterface.h" 00023 #include "BgzfFileType.h" 00024 00025 // Constructor, init variables. 00026 SamFile::SamFile() 00027 : myStatus() 00028 { 00029 init(); 00030 resetFile(); 00031 } 00032 00033 00034 // Constructor, init variables. 00035 SamFile::SamFile(ErrorHandler::HandlingType errorHandlingType) 00036 : myStatus(errorHandlingType) 00037 { 00038 init(); 00039 resetFile(); 00040 } 00041 00042 00043 // Constructor, init variables and open the specified file based on the 00044 // specified mode (READ/WRITE). 00045 SamFile::SamFile(const char* filename, OpenType mode) 00046 : myStatus() 00047 { 00048 init(filename, mode, NULL); 00049 } 00050 00051 00052 // Constructor, init variables and open the specified file based on the 00053 // specified mode (READ/WRITE). Default is READ.. 00054 SamFile::SamFile(const char* filename, OpenType mode, 00055 ErrorHandler::HandlingType errorHandlingType) 00056 : myStatus(errorHandlingType) 00057 { 00058 init(filename, mode, NULL); 00059 } 00060 00061 00062 // Constructor, init variables and open the specified file based on the 00063 // specified mode (READ/WRITE). 00064 SamFile::SamFile(const char* filename, OpenType mode, SamFileHeader* header) 00065 : myStatus() 00066 { 00067 init(filename, mode, header); 00068 } 00069 00070 00071 // Constructor, init variables and open the specified file based on the 00072 // specified mode (READ/WRITE). Default is READ.. 00073 SamFile::SamFile(const char* filename, OpenType mode, 00074 ErrorHandler::HandlingType errorHandlingType, 00075 SamFileHeader* header) 00076 : myStatus(errorHandlingType) 00077 { 00078 init(filename, mode, header); 00079 } 00080 00081 00082 SamFile::~SamFile() 00083 { 00084 resetFile(); 00085 if(myStatistics != NULL) 00086 { 00087 delete myStatistics; 00088 } 00089 } 00090 00091 00092 // Open a sam/bam file for reading with the specified filename. 00093 bool SamFile::OpenForRead(const char * filename, SamFileHeader* header) 00094 { 00095 // Reset for any previously operated on files. 00096 resetFile(); 00097 00098 int lastchar = 0; 00099 00100 while (filename[lastchar] != 0) lastchar++; 00101 00102 // If at least one character, check for '-'. 00103 if((lastchar >= 1) && (filename[0] == '-')) 00104 { 00105 // Read from stdin - determine type of file to read. 00106 // Determine if compressed bam. 00107 if(strcmp(filename, "-.bam") == 0) 00108 { 00109 // Compressed bam - open as bgzf. 00110 // -.bam is the filename, read compressed bam from stdin 00111 filename = "-"; 00112 00113 myFilePtr = new InputFile; 00114 // support recover mode - this switches in a reader 00115 // capable of recovering from bad BGZF compression blocks. 00116 myFilePtr->setAttemptRecovery(myAttemptRecovery); 00117 myFilePtr->openFile(filename, "rb", InputFile::BGZF); 00118 00119 myInterfacePtr = new BamInterface; 00120 00121 // Read the magic string. 00122 char magic[4]; 00123 ifread(myFilePtr, magic, 4); 00124 } 00125 else if(strcmp(filename, "-.ubam") == 0) 00126 { 00127 // uncompressed BAM File. 00128 // -.ubam is the filename, read uncompressed bam from stdin. 00129 // uncompressed BAM is still compressed with BGZF, but using 00130 // compression level 0, so still open as BGZF since it has a 00131 // BGZF header. 00132 filename = "-"; 00133 00134 // Uncompressed, so do not require the eof block. 00135 #ifdef __ZLIB_AVAILABLE__ 00136 BgzfFileType::setRequireEofBlock(false); 00137 #endif 00138 myFilePtr = ifopen(filename, "rb", InputFile::BGZF); 00139 00140 myInterfacePtr = new BamInterface; 00141 00142 // Read the magic string. 00143 char magic[4]; 00144 ifread(myFilePtr, magic, 4); 00145 } 00146 else if((strcmp(filename, "-") == 0) || (strcmp(filename, "-.sam") == 0)) 00147 { 00148 // SAM File. 00149 // read sam from stdin 00150 filename = "-"; 00151 myFilePtr = ifopen(filename, "rb", InputFile::UNCOMPRESSED); 00152 myInterfacePtr = new SamInterface; 00153 } 00154 else 00155 { 00156 std::string errorMessage = "Invalid SAM/BAM filename, "; 00157 errorMessage += filename; 00158 errorMessage += ". From stdin, can only be '-', '-.sam', '-.bam', or '-.ubam'"; 00159 myStatus.setStatus(SamStatus::FAIL_IO, errorMessage.c_str()); 00160 delete myFilePtr; 00161 myFilePtr = NULL; 00162 return(false); 00163 } 00164 } 00165 else 00166 { 00167 // Not from stdin. Read the file to determine the type. 00168 00169 myFilePtr = new InputFile; 00170 00171 // support recovery mode - this conditionally enables a reader 00172 // capable of recovering from bad BGZF compression blocks. 00173 myFilePtr->setAttemptRecovery(myAttemptRecovery); 00174 bool rc = myFilePtr->openFile(filename, "rb", InputFile::DEFAULT); 00175 00176 if (rc == false) 00177 { 00178 std::string errorMessage = "Failed to Open "; 00179 errorMessage += filename; 00180 errorMessage += " for reading"; 00181 myStatus.setStatus(SamStatus::FAIL_IO, errorMessage.c_str()); 00182 delete myFilePtr; 00183 myFilePtr = NULL; 00184 return(false); 00185 } 00186 00187 char magic[4]; 00188 ifread(myFilePtr, magic, 4); 00189 00190 if (magic[0] == 'B' && magic[1] == 'A' && magic[2] == 'M' && 00191 magic[3] == 1) 00192 { 00193 myInterfacePtr = new BamInterface; 00194 // Set that it is a bam file open for reading. This is needed to 00195 // determine if an index file can be used. 00196 myIsBamOpenForRead = true; 00197 } 00198 else 00199 { 00200 // Not a bam, so rewind to the beginning of the file so it 00201 // can be read. 00202 ifrewind(myFilePtr); 00203 myInterfacePtr = new SamInterface; 00204 } 00205 } 00206 00207 // File is open for reading. 00208 myIsOpenForRead = true; 00209 00210 // Read the header if one was passed in. 00211 if(header != NULL) 00212 { 00213 return(ReadHeader(*header)); 00214 } 00215 00216 // Successfully opened the file. 00217 myStatus = SamStatus::SUCCESS; 00218 return(true); 00219 } 00220 00221 00222 // Open a sam/bam file for writing with the specified filename. 00223 bool SamFile::OpenForWrite(const char * filename, SamFileHeader* header) 00224 { 00225 // Reset for any previously operated on files. 00226 resetFile(); 00227 00228 int lastchar = 0; 00229 while (filename[lastchar] != 0) lastchar++; 00230 if (lastchar >= 4 && 00231 filename[lastchar - 4] == 'u' && 00232 filename[lastchar - 3] == 'b' && 00233 filename[lastchar - 2] == 'a' && 00234 filename[lastchar - 1] == 'm') 00235 { 00236 // BAM File. 00237 // if -.ubam is the filename, write uncompressed bam to stdout 00238 if((lastchar == 6) && (filename[0] == '-') && (filename[1] == '.')) 00239 { 00240 filename = "-"; 00241 } 00242 00243 myFilePtr = ifopen(filename, "wb0", InputFile::BGZF); 00244 00245 myInterfacePtr = new BamInterface; 00246 } 00247 else if (lastchar >= 3 && 00248 filename[lastchar - 3] == 'b' && 00249 filename[lastchar - 2] == 'a' && 00250 filename[lastchar - 1] == 'm') 00251 { 00252 // BAM File. 00253 // if -.bam is the filename, write compressed bam to stdout 00254 if((lastchar == 5) && (filename[0] == '-') && (filename[1] == '.')) 00255 { 00256 filename = "-"; 00257 } 00258 myFilePtr = ifopen(filename, "wb", InputFile::BGZF); 00259 00260 myInterfacePtr = new BamInterface; 00261 } 00262 else 00263 { 00264 // SAM File 00265 // if - (followed by anything is the filename, 00266 // write uncompressed sam to stdout 00267 if((lastchar >= 1) && (filename[0] == '-')) 00268 { 00269 filename = "-"; 00270 } 00271 myFilePtr = ifopen(filename, "wb", InputFile::UNCOMPRESSED); 00272 00273 myInterfacePtr = new SamInterface; 00274 } 00275 00276 if (myFilePtr == NULL) 00277 { 00278 std::string errorMessage = "Failed to Open "; 00279 errorMessage += filename; 00280 errorMessage += " for writing"; 00281 myStatus.setStatus(SamStatus::FAIL_IO, errorMessage.c_str()); 00282 return(false); 00283 } 00284 00285 myIsOpenForWrite = true; 00286 00287 // Write the header if one was passed in. 00288 if(header != NULL) 00289 { 00290 return(WriteHeader(*header)); 00291 } 00292 00293 // Successfully opened the file. 00294 myStatus = SamStatus::SUCCESS; 00295 return(true); 00296 } 00297 00298 00299 // Read BAM Index file. 00300 bool SamFile::ReadBamIndex(const char* bamIndexFilename) 00301 { 00302 // Cleanup a previously setup index. 00303 if(myBamIndex != NULL) 00304 { 00305 delete myBamIndex; 00306 myBamIndex = NULL; 00307 } 00308 00309 // Create a new bam index. 00310 myBamIndex = new BamIndex(); 00311 SamStatus::Status indexStat = myBamIndex->readIndex(bamIndexFilename); 00312 00313 if(indexStat != SamStatus::SUCCESS) 00314 { 00315 std::string errorMessage = "Failed to read the bam Index file: "; 00316 errorMessage += bamIndexFilename; 00317 myStatus.setStatus(indexStat, errorMessage.c_str()); 00318 delete myBamIndex; 00319 myBamIndex = NULL; 00320 return(false); 00321 } 00322 myStatus = SamStatus::SUCCESS; 00323 return(true); 00324 } 00325 00326 00327 // Read BAM Index file. 00328 bool SamFile::ReadBamIndex() 00329 { 00330 if(myFilePtr == NULL) 00331 { 00332 // Can't read the bam index file because the BAM file has not yet been 00333 // opened, so we don't know the base filename for the index file. 00334 std::string errorMessage = "Failed to read the bam Index file -" 00335 " the BAM file needs to be read first in order to determine" 00336 " the index filename."; 00337 myStatus.setStatus(SamStatus::FAIL_ORDER, errorMessage.c_str()); 00338 return(false); 00339 } 00340 00341 const char* bamBaseName = myFilePtr->getFileName(); 00342 00343 std::string indexName = bamBaseName; 00344 indexName += ".bai"; 00345 00346 bool foundFile = true; 00347 try 00348 { 00349 if(ReadBamIndex(indexName.c_str()) == false) 00350 { 00351 foundFile = false; 00352 } 00353 } 00354 catch (std::exception&) 00355 { 00356 foundFile = false; 00357 } 00358 00359 // Check to see if the index file was found. 00360 if(!foundFile) 00361 { 00362 // Not found - try without the bam extension. 00363 // Locate the start of the bam extension 00364 size_t startExt = indexName.find(".bam"); 00365 if(startExt == std::string::npos) 00366 { 00367 // Could not find the .bam extension, so just return false since the 00368 // call to ReadBamIndex set the status. 00369 return(false); 00370 } 00371 // Remove ".bam" and try reading the index again. 00372 indexName.erase(startExt, 4); 00373 return(ReadBamIndex(indexName.c_str())); 00374 } 00375 return(true); 00376 } 00377 00378 00379 // Sets the reference to the specified genome sequence object. 00380 void SamFile::SetReference(GenomeSequence* reference) 00381 { 00382 myRefPtr = reference; 00383 } 00384 00385 00386 // Set the type of sequence translation to use when reading the sequence. 00387 void SamFile::SetReadSequenceTranslation(SamRecord::SequenceTranslation translation) 00388 { 00389 myReadTranslation = translation; 00390 } 00391 00392 00393 // Set the type of sequence translation to use when writing the sequence. 00394 void SamFile::SetWriteSequenceTranslation(SamRecord::SequenceTranslation translation) 00395 { 00396 myWriteTranslation = translation; 00397 } 00398 00399 // Close the file if there is one open. 00400 void SamFile::Close() 00401 { 00402 // Resetting the file will close it if it is open, and 00403 // will reset all other variables. 00404 resetFile(); 00405 } 00406 00407 00408 // Returns whether or not the file has been opened. 00409 // return: int - true = open; false = not open. 00410 bool SamFile::IsOpen() 00411 { 00412 if (myFilePtr != NULL) 00413 { 00414 // File Pointer is set, so return if it is open. 00415 return(myFilePtr->isOpen()); 00416 } 00417 // File pointer is not set, so return false, not open. 00418 return false; 00419 } 00420 00421 00422 // Returns whether or not the end of the file has been reached. 00423 // return: int - true = EOF; false = not eof. 00424 bool SamFile::IsEOF() 00425 { 00426 if (myFilePtr != NULL) 00427 { 00428 // File Pointer is set, so return if eof. 00429 return(ifeof(myFilePtr)); 00430 } 00431 // File pointer is not set, so return true, eof. 00432 return true; 00433 } 00434 00435 00436 // Read the header from the currently opened file. 00437 bool SamFile::ReadHeader(SamFileHeader& header) 00438 { 00439 myStatus = SamStatus::SUCCESS; 00440 if(myIsOpenForRead == false) 00441 { 00442 // File is not open for read 00443 myStatus.setStatus(SamStatus::FAIL_ORDER, 00444 "Cannot read header since the file is not open for reading"); 00445 return(false); 00446 } 00447 00448 if(myHasHeader == true) 00449 { 00450 // The header has already been read. 00451 myStatus.setStatus(SamStatus::FAIL_ORDER, 00452 "Cannot read header since it has already been read."); 00453 return(false); 00454 } 00455 00456 if(myInterfacePtr->readHeader(myFilePtr, header, myStatus)) 00457 { 00458 // The header has now been successfully read. 00459 myHasHeader = true; 00460 return(true); 00461 } 00462 return(false); 00463 } 00464 00465 00466 // Write the header to the currently opened file. 00467 bool SamFile::WriteHeader(SamFileHeader& header) 00468 { 00469 myStatus = SamStatus::SUCCESS; 00470 if(myIsOpenForWrite == false) 00471 { 00472 // File is not open for write 00473 // -OR- 00474 // The header has already been written. 00475 myStatus.setStatus(SamStatus::FAIL_ORDER, 00476 "Cannot write header since the file is not open for writing"); 00477 return(false); 00478 } 00479 00480 if(myHasHeader == true) 00481 { 00482 // The header has already been written. 00483 myStatus.setStatus(SamStatus::FAIL_ORDER, 00484 "Cannot write header since it has already been written"); 00485 return(false); 00486 } 00487 00488 if(myInterfacePtr->writeHeader(myFilePtr, header, myStatus)) 00489 { 00490 // The header has now been successfully written. 00491 myHasHeader = true; 00492 return(true); 00493 } 00494 00495 // return the status. 00496 return(false); 00497 } 00498 00499 00500 // Read a record from the currently opened file. 00501 bool SamFile::ReadRecord(SamFileHeader& header, 00502 SamRecord& record) 00503 { 00504 myStatus = SamStatus::SUCCESS; 00505 00506 if(myIsOpenForRead == false) 00507 { 00508 // File is not open for read 00509 myStatus.setStatus(SamStatus::FAIL_ORDER, 00510 "Cannot read record since the file is not open for reading"); 00511 throw(std::runtime_error("SOFTWARE BUG: trying to read a SAM/BAM record prior to opening the file.")); 00512 return(false); 00513 } 00514 00515 if(myHasHeader == false) 00516 { 00517 // The header has not yet been read. 00518 // TODO - maybe just read the header. 00519 myStatus.setStatus(SamStatus::FAIL_ORDER, 00520 "Cannot read record since the header has not been read."); 00521 throw(std::runtime_error("SOFTWARE BUG: trying to read a SAM/BAM record prior to reading the header.")); 00522 return(false); 00523 } 00524 00525 // Check to see if a new region has been set. If so, determine the 00526 // chunks for that region. 00527 if(myNewSection) 00528 { 00529 if(!processNewSection(header)) 00530 { 00531 // Failed processing a new section. Could be an 00532 // order issue like the file not being open or the 00533 // indexed file not having been read. 00534 // processNewSection sets myStatus with the failure reason. 00535 return(false); 00536 } 00537 } 00538 00539 // Read until a record is not successfully read or there are no more 00540 // requested records. 00541 while(myStatus == SamStatus::SUCCESS) 00542 { 00543 record.setReference(myRefPtr); 00544 record.setSequenceTranslation(myReadTranslation); 00545 00546 // If reading by index, this method will setup to ensure it is in 00547 // the correct position for the next record (if not already there). 00548 // Sets myStatus if it could not move to a good section. 00549 // Just returns true if it is not setup to read by index. 00550 if(!ensureIndexedReadPosition()) 00551 { 00552 // Either there are no more records in the section 00553 // or it failed to move to the right section, so there 00554 // is nothing more to read, stop looping. 00555 break; 00556 } 00557 00558 // File is open for reading and the header has been read, so read the 00559 // next record. 00560 myInterfacePtr->readRecord(myFilePtr, header, record, myStatus); 00561 if(myStatus != SamStatus::SUCCESS) 00562 { 00563 // Failed to read the record, so break out of the loop. 00564 break; 00565 } 00566 00567 // Successfully read a record, so check if we should filter it. 00568 // First check if it is out of the section. Returns true 00569 // if not reading by sections, returns false if the record 00570 // is outside of the section. Sets status to NO_MORE_RECS if 00571 // there is nothing left ot read in the section. 00572 if(!checkRecordInSection(record)) 00573 { 00574 // The record is not in the section. 00575 // The while loop will detect if NO_MORE_RECS was set. 00576 continue; 00577 } 00578 00579 // Check the flag for required/excluded flags. 00580 uint16_t flag = record.getFlag(); 00581 if((flag & myRequiredFlags) != myRequiredFlags) 00582 { 00583 // The record does not conatain all required flags, so 00584 // continue looking. 00585 continue; 00586 } 00587 if((flag & myExcludedFlags) != 0) 00588 { 00589 // The record contains an excluded flag, so continue looking. 00590 continue; 00591 } 00592 00593 //increment the record count. 00594 myRecordCount++; 00595 00596 if(myStatistics != NULL) 00597 { 00598 // Statistics should be updated. 00599 myStatistics->updateStatistics(record); 00600 } 00601 00602 // Successfully read the record, so check the sort order. 00603 if(!validateSortOrder(record, header)) 00604 { 00605 // ValidateSortOrder sets the status on a failure. 00606 return(false); 00607 } 00608 return(true); 00609 00610 } // End while loop that checks if a desired record is found or failure. 00611 00612 // Return true if a record was found. 00613 return(myStatus == SamStatus::SUCCESS); 00614 } 00615 00616 00617 00618 // Write a record to the currently opened file. 00619 bool SamFile::WriteRecord(SamFileHeader& header, 00620 SamRecord& record) 00621 { 00622 if(myIsOpenForWrite == false) 00623 { 00624 // File is not open for writing 00625 myStatus.setStatus(SamStatus::FAIL_ORDER, 00626 "Cannot write record since the file is not open for writing"); 00627 return(false); 00628 } 00629 00630 if(myHasHeader == false) 00631 { 00632 // The header has not yet been written. 00633 myStatus.setStatus(SamStatus::FAIL_ORDER, 00634 "Cannot write record since the header has not been written"); 00635 return(false); 00636 } 00637 00638 // Before trying to write the record, validate the sort order. 00639 if(!validateSortOrder(record, header)) 00640 { 00641 // Not sorted like it is supposed to be, do not write the record 00642 myStatus.setStatus(SamStatus::INVALID_SORT, 00643 "Cannot write the record since the file is not properly sorted."); 00644 return(false); 00645 } 00646 00647 if(myRefPtr != NULL) 00648 { 00649 record.setReference(myRefPtr); 00650 } 00651 00652 // File is open for writing and the header has been written, so write the 00653 // record. 00654 myStatus = myInterfacePtr->writeRecord(myFilePtr, header, record, 00655 myWriteTranslation); 00656 00657 if(myStatus == SamStatus::SUCCESS) 00658 { 00659 // A record was successfully written, so increment the record count. 00660 myRecordCount++; 00661 return(true); 00662 } 00663 return(false); 00664 } 00665 00666 00667 // Set the flag to validate that the file is sorted as it is read/written. 00668 // Must be called after the file has been opened. 00669 void SamFile::setSortedValidation(SortedType sortType) 00670 { 00671 mySortedType = sortType; 00672 } 00673 00674 00675 // Return the number of records that have been read/written so far. 00676 uint32_t SamFile::GetCurrentRecordCount() 00677 { 00678 return(myRecordCount); 00679 } 00680 00681 00682 // Sets what part of the SamFile should be read. 00683 bool SamFile::SetReadSection(int32_t refID) 00684 { 00685 // No start/end specified, so set back to default -1. 00686 return(SetReadSection(refID, -1, -1)); 00687 } 00688 00689 00690 00691 // Sets what part of the SamFile should be read. 00692 bool SamFile::SetReadSection(const char* refName) 00693 { 00694 // No start/end specified, so set back to default -1. 00695 return(SetReadSection(refName, -1, -1)); 00696 } 00697 00698 00699 // Sets what part of the BAM file should be read. 00700 bool SamFile::SetReadSection(int32_t refID, int32_t start, int32_t end, 00701 bool overlap) 00702 { 00703 // If there is not a BAM file open for reading, return failure. 00704 // Opening a new file clears the read section, so it must be 00705 // set after the file is opened. 00706 if(!myIsBamOpenForRead) 00707 { 00708 // There is not a BAM file open for reading. 00709 myStatus.setStatus(SamStatus::FAIL_ORDER, 00710 "Cannot set section since there is no bam file open"); 00711 return(false); 00712 } 00713 00714 myNewSection = true; 00715 myOverlapSection = overlap; 00716 myStartPos = start; 00717 myEndPos = end; 00718 myRefID = refID; 00719 myRefName.clear(); 00720 myChunksToRead.clear(); 00721 // Reset the end of the current chunk. We are resetting our read, so 00722 // we no longer have a "current chunk" that we are reading. 00723 myCurrentChunkEnd = 0; 00724 myStatus = SamStatus::SUCCESS; 00725 00726 // Reset the sort order criteria since we moved around in the file. 00727 myPrevCoord = -1; 00728 myPrevRefID = 0; 00729 myPrevReadName.Clear(); 00730 00731 return(true); 00732 } 00733 00734 00735 // Sets what part of the BAM file should be read. 00736 bool SamFile::SetReadSection(const char* refName, int32_t start, int32_t end, 00737 bool overlap) 00738 { 00739 // If there is not a BAM file open for reading, return failure. 00740 // Opening a new file clears the read section, so it must be 00741 // set after the file is opened. 00742 if(!myIsBamOpenForRead) 00743 { 00744 // There is not a BAM file open for reading. 00745 myStatus.setStatus(SamStatus::FAIL_ORDER, 00746 "Cannot set section since there is no bam file open"); 00747 return(false); 00748 } 00749 00750 myNewSection = true; 00751 myOverlapSection = overlap; 00752 myStartPos = start; 00753 myEndPos = end; 00754 if((strcmp(refName, "") == 0) || (strcmp(refName, "*") == 0)) 00755 { 00756 // No Reference name specified, so read just the "-1" entries. 00757 myRefID = BamIndex::REF_ID_UNMAPPED; 00758 } 00759 else 00760 { 00761 // save the reference name and revert the reference ID to unknown 00762 // so it will be calculated later. 00763 myRefName = refName; 00764 myRefID = BamIndex::REF_ID_ALL; 00765 } 00766 myChunksToRead.clear(); 00767 // Reset the end of the current chunk. We are resetting our read, so 00768 // we no longer have a "current chunk" that we are reading. 00769 myCurrentChunkEnd = 0; 00770 myStatus = SamStatus::SUCCESS; 00771 00772 // Reset the sort order criteria since we moved around in the file. 00773 myPrevCoord = -1; 00774 myPrevRefID = 0; 00775 myPrevReadName.Clear(); 00776 00777 return(true); 00778 } 00779 00780 00781 void SamFile::SetReadFlags(uint16_t requiredFlags, uint16_t excludedFlags) 00782 { 00783 myRequiredFlags = requiredFlags; 00784 myExcludedFlags = excludedFlags; 00785 } 00786 00787 00788 // Get the number of mapped reads in the specified reference id. 00789 // Returns -1 for out of range refIDs. 00790 int32_t SamFile::getNumMappedReadsFromIndex(int32_t refID) 00791 { 00792 // The bam index must have already been read. 00793 if(myBamIndex == NULL) 00794 { 00795 myStatus.setStatus(SamStatus::FAIL_ORDER, 00796 "Cannot get num mapped reads from the index until it has been read."); 00797 return(false); 00798 } 00799 return(myBamIndex->getNumMappedReads(refID)); 00800 } 00801 00802 00803 // Get the number of unmapped reads in the specified reference id. 00804 // Returns -1 for out of range refIDs. 00805 int32_t SamFile::getNumUnMappedReadsFromIndex(int32_t refID) 00806 { 00807 // The bam index must have already been read. 00808 if(myBamIndex == NULL) 00809 { 00810 myStatus.setStatus(SamStatus::FAIL_ORDER, 00811 "Cannot get num unmapped reads from the index until it has been read."); 00812 return(false); 00813 } 00814 return(myBamIndex->getNumUnMappedReads(refID)); 00815 } 00816 00817 00818 // Get the number of mapped reads in the specified reference id. 00819 // Returns -1 for out of range references. 00820 int32_t SamFile::getNumMappedReadsFromIndex(const char* refName, 00821 SamFileHeader& header) 00822 { 00823 // The bam index must have already been read. 00824 if(myBamIndex == NULL) 00825 { 00826 myStatus.setStatus(SamStatus::FAIL_ORDER, 00827 "Cannot get num mapped reads from the index until it has been read."); 00828 return(false); 00829 } 00830 int32_t refID = BamIndex::REF_ID_UNMAPPED; 00831 if((strcmp(refName, "") != 0) && (strcmp(refName, "*") != 0)) 00832 { 00833 // Reference name specified, so read just the "-1" entries. 00834 refID = header.getReferenceID(refName); 00835 } 00836 return(myBamIndex->getNumMappedReads(refID)); 00837 } 00838 00839 00840 // Get the number of unmapped reads in the specified reference id. 00841 // Returns -1 for out of range refIDs. 00842 int32_t SamFile::getNumUnMappedReadsFromIndex(const char* refName, 00843 SamFileHeader& header) 00844 { 00845 // The bam index must have already been read. 00846 if(myBamIndex == NULL) 00847 { 00848 myStatus.setStatus(SamStatus::FAIL_ORDER, 00849 "Cannot get num unmapped reads from the index until it has been read."); 00850 return(false); 00851 } 00852 int32_t refID = BamIndex::REF_ID_UNMAPPED; 00853 if((strcmp(refName, "") != 0) && (strcmp(refName, "*") != 0)) 00854 { 00855 // Reference name specified, so read just the "-1" entries. 00856 refID = header.getReferenceID(refName); 00857 } 00858 return(myBamIndex->getNumUnMappedReads(refID)); 00859 } 00860 00861 00862 // Returns the number of bases in the passed in read that overlap the 00863 // region that is currently set. 00864 uint32_t SamFile::GetNumOverlaps(SamRecord& samRecord) 00865 { 00866 if(myRefPtr != NULL) 00867 { 00868 samRecord.setReference(myRefPtr); 00869 } 00870 samRecord.setSequenceTranslation(myReadTranslation); 00871 00872 // Get the overlaps in the sam record for the region currently set 00873 // for this file. 00874 return(samRecord.getNumOverlaps(myStartPos, myEndPos)); 00875 } 00876 00877 00878 void SamFile::GenerateStatistics(bool genStats) 00879 { 00880 if(genStats) 00881 { 00882 if(myStatistics == NULL) 00883 { 00884 // Want to generate statistics, but do not yet have the 00885 // structure for them, so create one. 00886 myStatistics = new SamStatistics(); 00887 } 00888 } 00889 else 00890 { 00891 // Do not generate statistics, so if myStatistics is not NULL, 00892 // delete it. 00893 if(myStatistics != NULL) 00894 { 00895 delete myStatistics; 00896 myStatistics = NULL; 00897 } 00898 } 00899 00900 } 00901 00902 00903 const BamIndex* SamFile::GetBamIndex() 00904 { 00905 return(myBamIndex); 00906 } 00907 00908 00909 // initialize. 00910 void SamFile::init() 00911 { 00912 myFilePtr = NULL; 00913 myInterfacePtr = NULL; 00914 myStatistics = NULL; 00915 myBamIndex = NULL; 00916 myRefPtr = NULL; 00917 myReadTranslation = SamRecord::NONE; 00918 myWriteTranslation = SamRecord::NONE; 00919 myAttemptRecovery = false; 00920 myRequiredFlags = 0; 00921 myExcludedFlags = 0; 00922 } 00923 00924 00925 void SamFile::init(const char* filename, OpenType mode, SamFileHeader* header) 00926 { 00927 init(); 00928 00929 resetFile(); 00930 00931 bool openStatus = true; 00932 if(mode == READ) 00933 { 00934 // open the file for read. 00935 openStatus = OpenForRead(filename, header); 00936 } 00937 else 00938 { 00939 // open the file for write. 00940 openStatus = OpenForWrite(filename, header); 00941 } 00942 if(!openStatus) 00943 { 00944 // Failed to open the file - print error and abort. 00945 fprintf(stderr, "%s\n", GetStatusMessage()); 00946 std::cerr << "FAILURE - EXITING!!!" << std::endl; 00947 exit(-1); 00948 } 00949 } 00950 00951 00952 // Reset variables for each file. 00953 void SamFile::resetFile() 00954 { 00955 // Close the file. 00956 if (myFilePtr != NULL) 00957 { 00958 // If we already have an open file, close it. 00959 ifclose(myFilePtr); 00960 myFilePtr = NULL; 00961 } 00962 if(myInterfacePtr != NULL) 00963 { 00964 delete myInterfacePtr; 00965 myInterfacePtr = NULL; 00966 } 00967 00968 myIsOpenForRead = false; 00969 myIsOpenForWrite = false; 00970 myHasHeader = false; 00971 mySortedType = UNSORTED; 00972 myPrevReadName.Clear(); 00973 myPrevCoord = -1; 00974 myPrevRefID = 0; 00975 myRecordCount = 0; 00976 myStatus = SamStatus::SUCCESS; 00977 00978 // Reset indexed bam values. 00979 myIsBamOpenForRead = false; 00980 myRefID = BamIndex::REF_ID_ALL; 00981 myStartPos = -1; 00982 myEndPos = -1; 00983 myNewSection = false; 00984 myOverlapSection = true; 00985 myCurrentChunkEnd = 0; 00986 myChunksToRead.clear(); 00987 if(myBamIndex != NULL) 00988 { 00989 delete myBamIndex; 00990 myBamIndex = NULL; 00991 } 00992 00993 // If statistics are being generated, reset them. 00994 if(myStatistics != NULL) 00995 { 00996 myStatistics->reset(); 00997 } 00998 00999 myRefName.clear(); 01000 } 01001 01002 01003 // Validate that the record is sorted compared to the previously read record 01004 // if there is one, according to the specified sort order. 01005 // If the sort order is UNSORTED, true is returned. 01006 bool SamFile::validateSortOrder(SamRecord& record, SamFileHeader& header) 01007 { 01008 if(myRefPtr != NULL) 01009 { 01010 record.setReference(myRefPtr); 01011 } 01012 record.setSequenceTranslation(myReadTranslation); 01013 01014 bool status = false; 01015 if(mySortedType == UNSORTED) 01016 { 01017 // Unsorted, so nothing to validate, just return true. 01018 status = true; 01019 } 01020 else 01021 { 01022 // Check to see if mySortedType is based on the header. 01023 if(mySortedType == FLAG) 01024 { 01025 // Determine the sorted type from what was read out of the header. 01026 mySortedType = getSortOrderFromHeader(header); 01027 } 01028 01029 if(mySortedType == QUERY_NAME) 01030 { 01031 // Validate that it is sorted by query name. 01032 // Get the query name from the record. 01033 const char* readName = record.getReadName(); 01034 01035 // Check if it is sorted either in samtools way or picard's way. 01036 if((myPrevReadName.Compare(readName) > 0) && 01037 (strcmp(myPrevReadName.c_str(), readName) > 0)) 01038 { 01039 // return false. 01040 String errorMessage = "ERROR: File is not sorted by read name at record "; 01041 errorMessage += myRecordCount; 01042 errorMessage += "\n\tPrevious record was "; 01043 errorMessage += myPrevReadName; 01044 errorMessage += ", but this record is "; 01045 errorMessage += readName; 01046 myStatus.setStatus(SamStatus::INVALID_SORT, 01047 errorMessage.c_str()); 01048 status = false; 01049 } 01050 else 01051 { 01052 myPrevReadName = readName; 01053 status = true; 01054 } 01055 } 01056 else 01057 { 01058 // Validate that it is sorted by COORDINATES. 01059 // Get the leftmost coordinate and the reference index. 01060 int32_t refID = record.getReferenceID(); 01061 int32_t coord = record.get0BasedPosition(); 01062 // The unmapped reference id is at the end of a sorted file. 01063 if(refID == BamIndex::REF_ID_UNMAPPED) 01064 { 01065 // A new reference ID that is for the unmapped reads 01066 // is always valid. 01067 status = true; 01068 myPrevRefID = refID; 01069 myPrevCoord = coord; 01070 } 01071 else if(myPrevRefID == BamIndex::REF_ID_UNMAPPED) 01072 { 01073 // Previous reference ID was for unmapped reads, but the 01074 // current one is not, so this is not sorted. 01075 String errorMessage = "ERROR: File is not coordinate sorted at record "; 01076 errorMessage += myRecordCount; 01077 errorMessage += "\n\tPrevious record was unmapped, but this record is "; 01078 errorMessage += header.getReferenceLabel(refID) + ":" + coord; 01079 myStatus.setStatus(SamStatus::INVALID_SORT, 01080 errorMessage.c_str()); 01081 status = false; 01082 } 01083 else if(refID < myPrevRefID) 01084 { 01085 // Current reference id is less than the previous one, 01086 //meaning that it is not sorted. 01087 String errorMessage = "ERROR: File is not coordinate sorted at record "; 01088 errorMessage += myRecordCount; 01089 errorMessage += "\n\tPrevious record was "; 01090 errorMessage += header.getReferenceLabel(myPrevRefID) + ":" + myPrevCoord; 01091 errorMessage += ", but this record is "; 01092 errorMessage += header.getReferenceLabel(refID) + ":" + coord; 01093 myStatus.setStatus(SamStatus::INVALID_SORT, 01094 errorMessage.c_str()); 01095 status = false; 01096 } 01097 else 01098 { 01099 // The reference IDs are in the correct order. 01100 if(refID > myPrevRefID) 01101 { 01102 // New reference id, so set the previous coordinate to -1 01103 myPrevCoord = -1; 01104 } 01105 01106 // Check the coordinates. 01107 if(coord < myPrevCoord) 01108 { 01109 // New Coord is less than the previous position. 01110 String errorMessage = "ERROR: File is not coordinate sorted at record "; 01111 errorMessage += myRecordCount; 01112 errorMessage += "\n\tPreviousRecord was "; 01113 errorMessage += header.getReferenceLabel(myPrevRefID) + ":" + myPrevCoord; 01114 errorMessage += ", but this record is "; 01115 errorMessage += header.getReferenceLabel(refID) + ":" + coord; 01116 myStatus.setStatus(SamStatus::INVALID_SORT, 01117 errorMessage.c_str()); 01118 status = false; 01119 } 01120 else 01121 { 01122 myPrevRefID = refID; 01123 myPrevCoord = coord; 01124 status = true; 01125 } 01126 } 01127 } 01128 } 01129 01130 return(status); 01131 } 01132 01133 01134 SamFile::SortedType SamFile::getSortOrderFromHeader(SamFileHeader& header) 01135 { 01136 const char* tag = header.getSortOrder(); 01137 01138 // Default to unsorted since if it is not specified in the header 01139 // that is the value that should be used. 01140 SortedType headerSortOrder = UNSORTED; 01141 if(strcmp(tag, "queryname") == 0) 01142 { 01143 headerSortOrder = QUERY_NAME; 01144 } 01145 else if(strcmp(tag, "coordinate") == 0) 01146 { 01147 headerSortOrder = COORDINATE; 01148 } 01149 return(headerSortOrder); 01150 } 01151 01152 01153 bool SamFile::ensureIndexedReadPosition() 01154 { 01155 // If no sections are specified, return true. 01156 if(myRefID == BamIndex::REF_ID_ALL) 01157 { 01158 return(true); 01159 } 01160 01161 // Check to see if we have more to read out of the current chunk. 01162 // By checking the current position in relation to the current 01163 // end chunk. If the current position is >= the end of the 01164 // current chunk, then we must see to the next chunk. 01165 uint64_t currentPos = iftell(myFilePtr); 01166 if(currentPos >= myCurrentChunkEnd) 01167 { 01168 // If there are no more chunks to process, return failure. 01169 if(myChunksToRead.empty()) 01170 { 01171 myStatus = SamStatus::NO_MORE_RECS; 01172 return(false); 01173 } 01174 01175 // There are more chunks left, so get the next chunk. 01176 Chunk newChunk = myChunksToRead.pop(); 01177 01178 // Move to the location of the new chunk if it is not adjacent 01179 // to the current chunk. 01180 if(newChunk.chunk_beg != currentPos) 01181 { 01182 // New chunk is not adjacent, so move to it. 01183 if(ifseek(myFilePtr, newChunk.chunk_beg, SEEK_SET) != true) 01184 { 01185 // seek failed, cannot retrieve next record, return failure. 01186 myStatus.setStatus(SamStatus::FAIL_IO, 01187 "Failed to seek to the next record"); 01188 return(false); 01189 } 01190 } 01191 // Seek succeeded, set the end of the new chunk. 01192 myCurrentChunkEnd = newChunk.chunk_end; 01193 } 01194 return(true); 01195 } 01196 01197 01198 bool SamFile::checkRecordInSection(SamRecord& record) 01199 { 01200 bool recordFound = true; 01201 if(myRefID == BamIndex::REF_ID_ALL) 01202 { 01203 return(true); 01204 } 01205 // Check to see if it is in the correct reference/position. 01206 if(record.getReferenceID() != myRefID) 01207 { 01208 // Incorrect reference ID, return no more records. 01209 myStatus = SamStatus::NO_MORE_RECS; 01210 return(false); 01211 } 01212 01213 // Found a record. 01214 recordFound = true; 01215 01216 // If start/end position are set, verify that the alignment falls 01217 // within those. 01218 // If the alignment start is greater than the end of the region, 01219 // return NO_MORE_RECS. 01220 // Since myEndPos is Exclusive 0-based, anything >= myEndPos is outside 01221 // of the region. 01222 if((myEndPos != -1) && (record.get0BasedPosition() >= myEndPos)) 01223 { 01224 myStatus = SamStatus::NO_MORE_RECS; 01225 return(false); 01226 } 01227 01228 // We know the start is less than the end position, so the alignment 01229 // overlaps the region if the alignment end position is greater than the 01230 // start of the region. 01231 if((myStartPos != -1) && (record.get0BasedAlignmentEnd() < myStartPos)) 01232 { 01233 // If it does not overlap the region, so go to the next 01234 // record...set recordFound back to false. 01235 recordFound = false; 01236 } 01237 01238 if(!myOverlapSection) 01239 { 01240 // Needs to be fully contained. Not fully contained if 01241 // 1) the record start position is < the region start position. 01242 // or 01243 // 2) the end position is specified and the record end position 01244 // is greater than or equal to the region end position. 01245 // (equal to since the region is exclusive. 01246 if((record.get0BasedPosition() < myStartPos) || 01247 ((myEndPos != -1) && 01248 (record.get0BasedAlignmentEnd() >= myEndPos))) 01249 { 01250 // This record is not fully contained, so move on to the next 01251 // record. 01252 recordFound = false; 01253 } 01254 } 01255 01256 return(recordFound); 01257 } 01258 01259 01260 bool SamFile::processNewSection(SamFileHeader &header) 01261 { 01262 myNewSection = false; 01263 01264 // If there is no index file, return failure. 01265 if(myBamIndex == NULL) 01266 { 01267 // No bam index has been read. 01268 myStatus.setStatus(SamStatus::FAIL_ORDER, 01269 "Cannot read section since there is no index file open"); 01270 throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section prior to opening the BAM Index file.")); 01271 return(false); 01272 } 01273 01274 // If there is not a BAM file open for reading, return failure. 01275 if(!myIsBamOpenForRead) 01276 { 01277 // There is not a BAM file open for reading. 01278 myStatus.setStatus(SamStatus::FAIL_ORDER, 01279 "Cannot read section since there is no bam file open"); 01280 throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section without opening a BAM file.")); 01281 return(false); 01282 } 01283 01284 if(myHasHeader == false) 01285 { 01286 // The header has not yet been read. 01287 myStatus.setStatus(SamStatus::FAIL_ORDER, 01288 "Cannot read record since the header has not been read."); 01289 throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section prior to opening the header.")); 01290 return(false); 01291 } 01292 01293 // Indexed Bam open for read, so disable read buffering because iftell 01294 // will be used. 01295 // Needs to be done here after we already know that the header has been 01296 // read. 01297 myFilePtr->disableBuffering(); 01298 01299 myChunksToRead.clear(); 01300 // Reset the end of the current chunk. We are resetting our read, so 01301 // we no longer have a "current chunk" that we are reading. 01302 myCurrentChunkEnd = 0; 01303 01304 // Check to see if the read section was set based on the reference name 01305 // but not yet converted to reference id. 01306 if(!myRefName.empty()) 01307 { 01308 myRefID = header.getReferenceID(myRefName.c_str()); 01309 // Clear the myRefName length so this code is only executed once. 01310 myRefName.clear(); 01311 01312 // Check to see if a reference id was found. 01313 if(myRefID == SamReferenceInfo::NO_REF_ID) 01314 { 01315 myStatus = SamStatus::NO_MORE_RECS; 01316 return(false); 01317 } 01318 } 01319 01320 // Get the chunks associated with this reference region. 01321 if(myBamIndex->getChunksForRegion(myRefID, myStartPos, myEndPos, 01322 myChunksToRead) == true) 01323 { 01324 myStatus = SamStatus::SUCCESS; 01325 } 01326 else 01327 { 01328 String errorMsg = "Failed to get the specified region, refID = "; 01329 errorMsg += myRefID; 01330 errorMsg += "; startPos = "; 01331 errorMsg += myStartPos; 01332 errorMsg += "; endPos = "; 01333 errorMsg += myEndPos; 01334 myStatus.setStatus(SamStatus::FAIL_PARSE, 01335 errorMsg); 01336 } 01337 return(true); 01338 } 01339 01340 // 01341 // When the caller to SamFile::ReadRecord() catches an 01342 // exception, it may choose to call this method to resync 01343 // on the underlying binary stream. 01344 // 01345 // Arguments: a callback function that will requires length bytes 01346 // of data to validate a record header. 01347 // 01348 // The expected use case is to re-sync on the next probably valid 01349 // BAM record, so that we can resume reading even after detecting 01350 // a corrupted BAM file. 01351 // 01352 bool SamFile::attemptRecoverySync(bool (*checkSignature)(void *data) , int length) 01353 { 01354 if(myFilePtr==NULL) return false; 01355 // non-recovery aware objects will just return false: 01356 return myFilePtr->attemptRecoverySync(checkSignature, length); 01357 } 01358 01359 // Default Constructor. 01360 SamFileReader::SamFileReader() 01361 : SamFile() 01362 { 01363 } 01364 01365 01366 // Constructor that opens the specified file for read. 01367 SamFileReader::SamFileReader(const char* filename) 01368 : SamFile(filename, READ) 01369 { 01370 } 01371 01372 01373 01374 01375 // Constructor that opens the specified file for read. 01376 SamFileReader::SamFileReader(const char* filename, 01377 ErrorHandler::HandlingType errorHandlingType) 01378 : SamFile(filename, READ, errorHandlingType) 01379 { 01380 } 01381 01382 01383 // Constructor that opens the specified file for read. 01384 SamFileReader::SamFileReader(const char* filename, 01385 SamFileHeader* header) 01386 : SamFile(filename, READ, header) 01387 { 01388 } 01389 01390 01391 // Constructor that opens the specified file for read. 01392 SamFileReader::SamFileReader(const char* filename, 01393 ErrorHandler::HandlingType errorHandlingType, 01394 SamFileHeader* header) 01395 : SamFile(filename, READ, errorHandlingType, header) 01396 { 01397 } 01398 01399 01400 SamFileReader::~SamFileReader() 01401 { 01402 } 01403 01404 01405 // Default Constructor. 01406 SamFileWriter::SamFileWriter() 01407 : SamFile() 01408 { 01409 } 01410 01411 01412 // Constructor that opens the specified file for write. 01413 SamFileWriter::SamFileWriter(const char* filename) 01414 : SamFile(filename, WRITE) 01415 { 01416 } 01417 01418 01419 // Constructor that opens the specified file for write. 01420 SamFileWriter::SamFileWriter(const char* filename, 01421 ErrorHandler::HandlingType errorHandlingType) 01422 : SamFile(filename, WRITE, errorHandlingType) 01423 { 01424 } 01425 01426 01427 // Constructor that opens the specified file for write. 01428 SamFileWriter::SamFileWriter(const char* filename, 01429 SamFileHeader* header) 01430 : SamFile(filename, WRITE, header) 01431 { 01432 } 01433 01434 01435 // Constructor that opens the specified file for write. 01436 SamFileWriter::SamFileWriter(const char* filename, 01437 ErrorHandler::HandlingType errorHandlingType, 01438 SamFileHeader* header) 01439 : SamFile(filename, WRITE, errorHandlingType, header) 01440 { 01441 } 01442 01443 01444 SamFileWriter::~SamFileWriter() 01445 { 01446 }