SamFile.cpp

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 #include <stdexcept>
00018 #include "SamFile.h"
00019 #include "SamFileHeader.h"
00020 #include "SamRecord.h"
00021 #include "BamInterface.h"
00022 #include "SamInterface.h"
00023 
00024 // Constructor, init variables.
00025 SamFile::SamFile()
00026     : myFilePtr(NULL),
00027       myInterfacePtr(NULL),
00028       myStatistics(NULL),
00029       myStatus(),
00030       myBamIndex(NULL)
00031 {
00032     resetFile();
00033 }
00034 
00035 
00036 // Constructor, init variables.
00037 SamFile::SamFile(ErrorHandler::HandlingType errorHandlingType)
00038     : myFilePtr(NULL),
00039       myInterfacePtr(NULL),
00040       myStatistics(NULL),
00041       myStatus(errorHandlingType),
00042       myBamIndex(NULL)
00043 {
00044     resetFile();
00045 }
00046 
00047 
00048 // Constructor, init variables and open the specified file based on the
00049 // specified mode (READ/WRITE).
00050 SamFile::SamFile(const char* filename, OpenType mode)
00051     : myFilePtr(NULL),
00052       myInterfacePtr(NULL),
00053       myStatistics(NULL),
00054       myStatus(),
00055       myBamIndex(NULL)
00056 {
00057     resetFile();
00058 
00059     bool openStatus = true;
00060     if(mode == READ)
00061     {
00062         // open the file for read.
00063         openStatus = OpenForRead(filename);
00064     }
00065     else
00066     {
00067         // open the file for write.
00068         openStatus = OpenForWrite(filename);
00069     }
00070     if(!openStatus)
00071     {
00072         // Failed to open the file - print error and abort.
00073         fprintf(stderr, "%s\n", GetStatusMessage());
00074         std::cerr << "FAILURE - EXITING!!!" << std::endl;
00075         exit(-1);
00076     }
00077 }
00078 
00079 
00080 // Constructor, init variables and open the specified file based on the
00081 // specified mode (READ/WRITE).  Default is READ..
00082 SamFile::SamFile(const char* filename, OpenType mode,
00083                  ErrorHandler::HandlingType errorHandlingType)
00084     : myFilePtr(NULL),
00085       myInterfacePtr(NULL),
00086       myStatistics(NULL),
00087       myStatus(errorHandlingType),
00088       myBamIndex(NULL)
00089 {
00090     resetFile();
00091 
00092     bool openStatus = true;
00093     if(mode == READ)
00094     {
00095         // open the file for read.
00096         openStatus = OpenForRead(filename);
00097     }
00098     else
00099     {
00100         // open the file for write.
00101         openStatus = OpenForWrite(filename);
00102     }
00103     if(!openStatus)
00104     {
00105         // Failed to open the file - print error and abort.
00106         fprintf(stderr, "%s\n", GetStatusMessage());
00107         std::cerr << "FAILURE - EXITING!!!" << std::endl;
00108         exit(-1);
00109     }
00110 }
00111 
00112 
00113 SamFile::~SamFile()
00114 {
00115     resetFile();
00116     if(myStatistics != NULL)
00117     {
00118         delete myStatistics;
00119     }
00120 }
00121 
00122 
00123 // Open a sam/bam file for reading with the specified filename.
00124 bool SamFile::OpenForRead(const char * filename)
00125 {
00126     // Reset for any previously operated on files.
00127     resetFile();
00128 
00129     int lastchar = 0;
00130 
00131     while (filename[lastchar] != 0) lastchar++;
00132 
00133     // If at least one character, check for '-'.
00134     if((lastchar >= 1) && (filename[0] == '-'))
00135     {
00136         // Read from stdin - determine type of file to read.
00137         // Determine if compressed bam.
00138         if(strcmp(filename, "-.bam") == 0)
00139         {
00140             // Compressed bam - open as bgzf.
00141             // -.bam is the filename, read compressed bam from stdin
00142             filename = "-";
00143             myFilePtr = ifopen(filename, "rb", InputFile::BGZF);
00144             
00145             myInterfacePtr = new BamInterface;
00146 
00147             // Read the magic string.
00148             char magic[4];
00149             ifread(myFilePtr, magic, 4);
00150         }
00151         else if(strcmp(filename, "-.ubam") == 0)
00152         {
00153             // uncompressed BAM File.
00154             // -.ubam is the filename, read uncompressed bam from stdin
00155             filename = "-";
00156             myFilePtr = ifopen(filename, "rb", InputFile::UNCOMPRESSED);
00157         
00158             myInterfacePtr = new BamInterface;
00159 
00160             // Read the magic string.
00161             char magic[4];
00162             ifread(myFilePtr, magic, 4);
00163         }
00164         else
00165         {
00166             // SAM File.
00167             // read sam from stdin
00168             filename = "-";
00169             myFilePtr = ifopen(filename, "rb", InputFile::UNCOMPRESSED);
00170             myInterfacePtr = new SamInterface;
00171         }
00172     }
00173     else
00174     {
00175         // Not from stdin.  Read the file to determine the type.
00176         myFilePtr = ifopen(filename, "rb");
00177         
00178         if (myFilePtr == NULL)
00179         {
00180             std::string errorMessage = "Failed to Open ";
00181             errorMessage += filename;
00182             errorMessage += " for reading";
00183             myStatus.setStatus(SamStatus::FAIL_IO, errorMessage.c_str());
00184             return(false);
00185         }
00186         
00187         char magic[4];
00188         ifread(myFilePtr, magic, 4);
00189         
00190         if (magic[0] == 'B' && magic[1] == 'A' && magic[2] == 'M' &&
00191             magic[3] == 1)
00192         {
00193             myInterfacePtr = new BamInterface;
00194             // Set that it is a bam file open for reading.  This is needed to
00195             // determine if an index file can be used.
00196             myIsBamOpenForRead = true;
00197         }
00198         else
00199         {
00200             // Not a bam, so rewind to the beginning of the file so it
00201             // can be read.
00202             ifrewind(myFilePtr);
00203             myInterfacePtr = new SamInterface;
00204         }
00205     }
00206 
00207     // File is open for reading.
00208     myIsOpenForRead = true;
00209     // Successfully opened the file.
00210     myStatus = SamStatus::SUCCESS;
00211     return(true);
00212 }
00213 
00214 
00215 // Open a sam/bam file for writing with the specified filename.
00216 bool SamFile::OpenForWrite(const char * filename)
00217 {
00218     // Reset for any previously operated on files.
00219     resetFile();
00220     
00221     int lastchar = 0;
00222     while (filename[lastchar] != 0) lastchar++;   
00223     if (lastchar >= 4 && 
00224         filename[lastchar - 4] == 'u' &&
00225         filename[lastchar - 3] == 'b' &&
00226         filename[lastchar - 2] == 'a' &&
00227         filename[lastchar - 1] == 'm')
00228     {
00229         // BAM File.
00230         // if -.ubam is the filename, write uncompressed bam to stdout
00231         if((lastchar == 6) && (filename[0] == '-') && (filename[1] == '.'))
00232         {
00233             filename = "-";
00234         }
00235         myFilePtr = ifopen(filename, "wb", InputFile::UNCOMPRESSED);
00236 
00237         myInterfacePtr = new BamInterface;
00238     }
00239     else if (lastchar >= 3 && 
00240              filename[lastchar - 3] == 'b' &&
00241              filename[lastchar - 2] == 'a' &&
00242              filename[lastchar - 1] == 'm')
00243     {
00244         // BAM File.
00245         // if -.bam is the filename, write compressed bam to stdout
00246         if((lastchar == 5) && (filename[0] == '-') && (filename[1] == '.'))
00247         {
00248             filename = "-";
00249         }
00250         myFilePtr = ifopen(filename, "wb", InputFile::BGZF);
00251         
00252         myInterfacePtr = new BamInterface;
00253     }
00254     else
00255     {
00256         // SAM File
00257         // if - (followed by anything is the filename,
00258         // write uncompressed sam to stdout
00259         if((lastchar >= 1) && (filename[0] == '-'))
00260         {
00261             filename = "-";
00262         }
00263         myFilePtr = ifopen(filename, "wb", InputFile::UNCOMPRESSED);
00264    
00265         myInterfacePtr = new SamInterface;
00266     }
00267 
00268     if (myFilePtr == NULL)
00269     {
00270         std::string errorMessage = "Failed to Open ";
00271         errorMessage += filename;
00272         errorMessage += " for writing";
00273         myStatus.setStatus(SamStatus::FAIL_IO, errorMessage.c_str());
00274         return(false);
00275     }
00276    
00277     myIsOpenForWrite = true;
00278 
00279     // Successfully opened the file.
00280     myStatus = SamStatus::SUCCESS;
00281     return(true);
00282 }
00283 
00284 
00285 // Read BAM Index file.
00286 bool SamFile::ReadBamIndex(const char* bamIndexFilename)
00287 {
00288     // Cleanup a previously setup index.
00289     if(myBamIndex != NULL)
00290     {
00291         delete myBamIndex;
00292         myBamIndex = NULL;
00293     }
00294 
00295     // Create a new bam index.
00296     myBamIndex = new BamIndex();
00297     SamStatus::Status indexStat = myBamIndex->readIndex(bamIndexFilename);
00298 
00299     if(indexStat != SamStatus::SUCCESS)
00300     {
00301         std::string errorMessage = "Failed to read the bam Index file: ";
00302         errorMessage += bamIndexFilename;
00303         myStatus.setStatus(indexStat, errorMessage.c_str());
00304         delete myBamIndex;
00305         myBamIndex = NULL;
00306         return(false);
00307     }
00308     myStatus = SamStatus::SUCCESS;
00309     return(true);
00310 }
00311 
00312 
00313 // Close the file if there is one open.
00314 void SamFile::Close()
00315 {
00316     // Resetting the file will close it if it is open, and
00317     // will reset all other variables.
00318     resetFile();
00319 }
00320 
00321 
00322 // Returns whether or not the end of the file has been reached.
00323 // return: int - true = EOF; false = not eof.
00324 bool SamFile::IsEOF()
00325 {
00326     if (myFilePtr != NULL)
00327     {
00328         // File Pointer is set, so return if eof.
00329         return(ifeof(myFilePtr));
00330     }
00331     // File pointer is not set, so return true, eof.
00332     return true;
00333 }
00334 
00335 
00336 // Read the header from the currently opened file.
00337 bool SamFile::ReadHeader(SamFileHeader& header)
00338 {
00339     if(myIsOpenForRead == false)
00340     {
00341         // File is not open for read
00342         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00343                            "Cannot read header since the file is not open for reading");
00344         return(false);
00345     }
00346 
00347     if(myHasHeader == true)
00348     {
00349         // The header has already been read.
00350         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00351                            "Cannot read header since it has already been read.");
00352         return(false);
00353     }
00354 
00355     myStatus = myInterfacePtr->readHeader(myFilePtr, header);
00356     if(myStatus == SamStatus::SUCCESS)
00357     {
00358         // The header has now been successfully read.
00359         myHasHeader = true;
00360         return(true);
00361     }
00362     return(false);
00363 }
00364 
00365 
00366 // Write the header to the currently opened file.
00367 bool SamFile::WriteHeader(SamFileHeader& header)
00368 {
00369     if(myIsOpenForWrite == false)
00370     {
00371         // File is not open for write
00372         // -OR-
00373         // The header has already been written.
00374         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00375                            "Cannot write header since the file is not open for writing");
00376         return(false);
00377     }
00378 
00379     if(myHasHeader == true)
00380     {
00381         // The header has already been written.
00382         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00383                            "Cannot write header since it has already been written");
00384         return(false);
00385     }
00386 
00387     myStatus = myInterfacePtr->writeHeader(myFilePtr, header);
00388     if(myStatus == SamStatus::SUCCESS)
00389     {
00390         // The header has now been successfully written.
00391         myHasHeader = true;
00392         return(true);
00393     }
00394 
00395     // return the status.
00396     return(false);
00397 }
00398 
00399 
00400 // Read a record from the currently opened file.
00401 bool SamFile::ReadRecord(SamFileHeader& header, 
00402                          SamRecord& record)
00403 {
00404     myStatus = SamStatus::SUCCESS;
00405 
00406     if(myIsOpenForRead == false)
00407     {
00408         // File is not open for read
00409         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00410                            "Cannot read record since the file is not open for reading");
00411         throw(std::runtime_error("SOFTWARE BUG: trying to read a SAM/BAM record prior to opening the file."));
00412         return(false);
00413     }
00414 
00415     if(myHasHeader == false)
00416     {
00417         // The header has not yet been read.
00418         // TODO - maybe just read the header.
00419         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00420                            "Cannot read record since the header has not been read.");
00421         throw(std::runtime_error("SOFTWARE BUG: trying to read a SAM/BAM record prior to reading the header."));
00422         return(false);
00423     }
00424 
00425     // Check to see if a new region has been set.  If so, determine the
00426     // chunks for that region.
00427     if(myNewSection)
00428     {
00429         if(!processNewSection(header))
00430         {
00431             // Failed processing a new section.  Could be an 
00432             // order issue like the file not being open or the
00433             // indexed file not having been read.
00434             // processNewSection sets myStatus with the failure reason.
00435             return(false);
00436         }
00437     }
00438 
00439     // Check to see if the file should be read by index.
00440     if(myRefID != BamIndex::REF_ID_ALL)
00441     {
00442         // Reference ID is set, so read by index.
00443         return(readIndexedRecord(header, record));
00444     }
00445 
00446     // File is open for reading and the header has been read, so read the next
00447     // record.
00448     myInterfacePtr->readRecord(myFilePtr, header, record, myStatus);
00449     if(myStatus == SamStatus::SUCCESS)
00450     {
00451         // A record was successfully read, so increment the record count.
00452         myRecordCount++;
00453 
00454         if(myStatistics != NULL)
00455         {
00456             // Statistics should be updated.
00457             myStatistics->updateStatistics(record);
00458         }
00459 
00460         // Successfully read the record, so check the sort order.
00461         if(!validateSortOrder(record, header))
00462         {
00463             // ValidateSortOrder sets the status on a failure.
00464             return(false);
00465         }
00466         return(true);
00467     }
00468     // Failed to read the record.
00469     return(false);
00470 }
00471 
00472 
00473 
00474 // Write a record to the currently opened file.
00475 bool SamFile::WriteRecord(SamFileHeader& header, 
00476                           SamRecord& record)
00477 {
00478     if(myIsOpenForWrite == false)
00479     {
00480         // File is not open for writing
00481         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00482                            "Cannot write record since the file is not open for writing");
00483         return(false);
00484     }
00485 
00486     if(myHasHeader == false)
00487     {
00488         // The header has not yet been written.
00489         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00490                            "Cannot write record since the header has not been written");
00491         return(false);
00492     }
00493 
00494     // Before trying to write the record, validate the sort order.
00495     if(!validateSortOrder(record, header))
00496     {
00497         // Not sorted like it is supposed to be, do not write the record
00498         myStatus.setStatus(SamStatus::INVALID_SORT, 
00499                            "Cannot write the record since the file is not properly sorted.");
00500         return(false);
00501     }
00502 
00503     // File is open for writing and the header has been written, so write the
00504     // record.
00505     myStatus = myInterfacePtr->writeRecord(myFilePtr, header, record);
00506 
00507     if(myStatus == SamStatus::SUCCESS)
00508     {
00509         // A record was successfully written, so increment the record count.
00510         myRecordCount++;
00511         return(true);
00512     }
00513     return(false);
00514 }
00515 
00516 
00517 // Set the flag to validate that the file is sorted as it is read/written.
00518 // Must be called after the file has been opened.
00519 void SamFile::setSortedValidation(SortedType sortType)
00520 {
00521     mySortedType = sortType;
00522 }
00523 
00524 
00525 // Return the number of records that have been read/written so far.
00526 uint32_t SamFile::GetCurrentRecordCount()
00527 {
00528     return(myRecordCount);
00529 }
00530 
00531 
00532 // Sets what part of the SamFile should be read.
00533 bool SamFile::SetReadSection(int32_t refID)
00534 {
00535     // No start/end specified, so set back to default -1.
00536     return(SetReadSection(refID, -1, -1));
00537 }
00538 
00539 
00540 
00541 // Sets what part of the SamFile should be read.
00542 bool SamFile::SetReadSection(const char* refName)
00543 {
00544     // No start/end specified, so set back to default -1.
00545     return(SetReadSection(refName, -1, -1));
00546 }
00547 
00548 
00549 // Sets what part of the BAM file should be read.
00550 bool SamFile::SetReadSection(int32_t refID, int32_t start, int32_t end)
00551 {
00552     // If there is not a BAM file open for reading, return failure.
00553     // Opening a new file clears the read section, so it must be
00554     // set after the file is opened.
00555     if(!myIsBamOpenForRead)
00556     {
00557         // There is not a BAM file open for reading.
00558         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00559                            "Canot set section since there is no bam file open");
00560         return(false);
00561     }
00562 
00563     myNewSection = true;
00564     myStartPos = start;
00565     myEndPos = end;
00566     myRefID = refID;
00567     myRefName.clear();
00568     myChunksToRead.clear();
00569     // Reset the end of the current chunk.  We are resetting our read, so
00570     // we no longer have a "current chunk" that we are reading.
00571     myCurrentChunkEnd = 0;
00572     myStatus = SamStatus::SUCCESS;
00573     
00574     return(true);
00575 }
00576 
00577 
00578 // Sets what part of the BAM file should be read.
00579 bool SamFile::SetReadSection(const char* refName, int32_t start, int32_t end)
00580 {
00581     // If there is not a BAM file open for reading, return failure.
00582     // Opening a new file clears the read section, so it must be
00583     // set after the file is opened.
00584     if(!myIsBamOpenForRead)
00585     {
00586         // There is not a BAM file open for reading.
00587         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00588                            "Canot set section since there is no bam file open");
00589         return(false);
00590     }
00591 
00592     myNewSection = true;
00593     myStartPos = start;
00594     myEndPos = end;
00595     if((strcmp(refName, "") == 0) || (strcmp(refName, "*") == 0))
00596     {
00597         // No Reference name specified, so read just the "-1" entries.
00598         myRefID = BamIndex::REF_ID_UNMAPPED;
00599     }
00600     else
00601     {
00602         // save the reference name and revert the reference ID to unknown
00603         // so it will be calculated later.
00604         myRefName = refName;
00605         myRefID = BamIndex::REF_ID_ALL;
00606     }
00607     myChunksToRead.clear();
00608     // Reset the end of the current chunk.  We are resetting our read, so
00609     // we no longer have a "current chunk" that we are reading.
00610     myCurrentChunkEnd = 0;
00611     myStatus = SamStatus::SUCCESS;
00612     
00613     return(true);
00614 }
00615 
00616 
00617 // Returns the number of bases in the passed in read that overlap the
00618 // region that is currently set.
00619 uint32_t SamFile::GetNumOverlaps(SamRecord& samRecord)
00620 {
00621     // Get the overlaps in the sam record for the region currently set
00622     // for this file.
00623     return(samRecord.getNumOverlaps(myStartPos, myEndPos));
00624 }
00625 
00626 
00627 void SamFile::GenerateStatistics(bool genStats)
00628 {
00629     if(genStats)
00630     {
00631         if(myStatistics == NULL)
00632         {
00633             // Want to generate statistics, but do not yet have the
00634             // structure for them, so create one.
00635             myStatistics = new SamStatistics();
00636         }
00637     }
00638     else
00639     {
00640         // Do not generate statistics, so if myStatistics is not NULL, 
00641         // delete it.
00642         if(myStatistics != NULL)
00643         {
00644             delete myStatistics;
00645             myStatistics = NULL;
00646         }
00647     }
00648 
00649 }
00650 
00651 
00652 // Reset variables for each file.
00653 void SamFile::resetFile()
00654 {
00655     // Close the file.
00656     if (myFilePtr != NULL)
00657     {
00658         // If we already have an open file, close it.
00659         ifclose(myFilePtr);
00660         myFilePtr = NULL;
00661     }
00662     if(myInterfacePtr != NULL)
00663     {
00664         delete myInterfacePtr;
00665         myInterfacePtr = NULL;
00666     }
00667 
00668     myIsOpenForRead = false;
00669     myIsOpenForWrite = false;
00670     myHasHeader = false;
00671     mySortedType = UNSORTED;
00672     myPrevReadName = "";
00673     myPrevCoord = -1;
00674     myPrevRefID = 0;
00675     myRecordCount = 0;
00676     myStatus = SamStatus::SUCCESS;
00677 
00678     // Reset indexed bam values.
00679     myIsBamOpenForRead = false;
00680     myRefID = BamIndex::REF_ID_ALL;
00681     myStartPos = -1;
00682     myEndPos = -1;
00683     myNewSection = false;
00684     myCurrentChunkEnd = 0;
00685     myChunksToRead.clear();
00686     if(myBamIndex != NULL)
00687     {
00688         delete myBamIndex;
00689         myBamIndex = NULL;
00690     }
00691 
00692     // If statistics are being generated, reset them.
00693     if(myStatistics != NULL)
00694     {
00695         myStatistics->reset();
00696     }
00697 
00698     myRefName.clear();
00699 }
00700 
00701 
00702 // Validate that the record is sorted compared to the previously read record
00703 // if there is one, according to the specified sort order.
00704 // If the sort order is UNSORTED, true is returned.
00705 bool SamFile::validateSortOrder(SamRecord& record, SamFileHeader& header)
00706 {
00707     bool status = false;
00708     if(mySortedType == UNSORTED)
00709     {
00710         // Unsorted, so nothing to validate, just return true.
00711         status = true;
00712     }
00713     else 
00714     {
00715         // Check to see if mySortedType is based on the header.
00716         if(mySortedType == FLAG)
00717         {
00718             // Determine the sorted type from what was read out of the header.
00719             mySortedType = getSortOrderFromHeader(header);
00720         }
00721 
00722         if(mySortedType == QUERY_NAME)
00723         {
00724             // Validate that it is sorted by query name.
00725             // Get the query name from the record.
00726             const char* readName = record.getReadName();
00727             if(myPrevReadName.compare(readName) > 0)
00728             {
00729                 // The previous name is greater than the new record's name, so
00730                 // return false.
00731                 String errorMessage = "ERROR: File is not sorted at record ";
00732                 errorMessage += myRecordCount;
00733                 myStatus.setStatus(SamStatus::INVALID_SORT, 
00734                                    errorMessage.c_str());
00735                 status = false;
00736             }
00737             else
00738             {
00739                 myPrevReadName = readName;
00740                 status = true;
00741             }
00742         }
00743         else 
00744         {
00745             // Validate that it is sorted by COORDINATES.
00746             // Get the leftmost coordinate and the reference index.
00747             int32_t refID = record.getReferenceID();
00748             int32_t coord = record.get0BasedPosition();
00749             // The unmapped reference id is at the end of a sorted file.
00750             if(refID == BamIndex::REF_ID_UNMAPPED)
00751             {
00752                 // A new reference ID that is for the unmapped reads
00753                 // is always valid.
00754                 status = true;
00755                 myPrevRefID = refID;
00756                 myPrevCoord = coord;
00757             }
00758             else if(myPrevRefID == BamIndex::REF_ID_UNMAPPED)
00759             {
00760                 // Previous reference ID was for unmapped reads, but the
00761                 // current one is not, so this is not sorted.
00762                 String errorMessage = "ERROR: File is not sorted at record ";
00763                 errorMessage += myRecordCount;
00764                 myStatus.setStatus(SamStatus::INVALID_SORT, 
00765                                    errorMessage.c_str());
00766                 status = false;
00767             }
00768             else if(refID < myPrevRefID)
00769             {
00770                 // Current reference id is less than the previous one, 
00771                 //meaning that it is not sorted.
00772                 String errorMessage = "ERROR: File is not sorted at record ";
00773                 errorMessage += myRecordCount;
00774                 myStatus.setStatus(SamStatus::INVALID_SORT, 
00775                                    errorMessage.c_str());
00776                 status = false;
00777             }
00778             else
00779             {
00780                 // The reference IDs are in the correct order.
00781                 if(refID > myPrevRefID)
00782                 {
00783                     // New reference id, so set the previous coordinate to -1
00784                     myPrevCoord = -1;
00785                 }
00786             
00787                 // Check the coordinates.
00788                 if(coord < myPrevCoord)
00789                 {
00790                     // New Coord is less than the previous position.
00791                     String errorMessage = "ERROR: File is not sorted at record ";
00792                     errorMessage += myRecordCount;
00793                     myStatus.setStatus(SamStatus::INVALID_SORT, 
00794                                        errorMessage.c_str());
00795                     status = false;
00796                 }
00797                 else
00798                 {
00799                     myPrevRefID = refID;
00800                     myPrevCoord = coord;
00801                     status = true;
00802                 }
00803             }
00804         }
00805     }
00806 
00807     return(status);
00808 }
00809 
00810 
00811 SamFile::SortedType SamFile::getSortOrderFromHeader(SamFileHeader& header)
00812 {
00813     const char* tag = header.getSortOrder();
00814    
00815     // Default to unsorted since if it is not specified in the header
00816     // that is the value that should be used.
00817     SortedType headerSortOrder = UNSORTED;
00818     if(strcmp(tag, "queryname") == 0)
00819     {
00820         headerSortOrder = QUERY_NAME;
00821     }
00822     else if(strcmp(tag, "coordinate") == 0)
00823     {
00824         headerSortOrder = COORDINATE;
00825     }
00826     return(headerSortOrder);
00827 }
00828 
00829 
00830 // Read a record from the currently opened file based on the indexing info.
00831 bool SamFile::readIndexedRecord(SamFileHeader& header, 
00832                                 SamRecord& record)
00833 {
00834     // A section to read is set and the file is open for reading, as verified
00835     // prior to calling into this protected method.
00836     bool recordFound = false;
00837     while(!recordFound)
00838     {
00839         // Check to see if we have more to read out of the current chunk.
00840         // By checking the current position in relation to the current
00841         // end chunk.  If the current position is >= the end of the
00842         // current chunk, then we must see to the next chunk.
00843         uint64_t currentPos = iftell(myFilePtr);
00844         if(currentPos >= myCurrentChunkEnd)
00845         {
00846             // If there are no more chunks to process, return failure.
00847             if(myChunksToRead.empty())
00848             {
00849                 myStatus = SamStatus::NO_MORE_RECS;
00850                 return(false);
00851             }
00852       
00853             // There are more chunks left, so get the next chunk.
00854             Chunk newChunk = myChunksToRead.pop();
00855 
00856             // Move to the location of the new chunk if it is not adjacent
00857             // to the current chunk.
00858             if(newChunk.chunk_beg != currentPos)
00859             {
00860                 // New chunk is not adjacent, so move to it.
00861                 if(ifseek(myFilePtr, newChunk.chunk_beg, SEEK_SET) != true)
00862                 {
00863                     // seek failed, cannot retrieve next record, return failure.
00864                     myStatus.setStatus(SamStatus::FAIL_IO, 
00865                                        "Failed to seek to the next record");
00866                     return(false);
00867                 }
00868             }
00869             // Seek succeeded, set the end of the new chunk.
00870             myCurrentChunkEnd = newChunk.chunk_end;
00871         }
00872 
00873         // At the position of the next record.  Read the record.
00874         myInterfacePtr->readRecord(myFilePtr, header, record, myStatus);
00875         if(myStatus != SamStatus::SUCCESS)
00876         {
00877             // Reading the record failed, return failure.
00878             return(false);
00879         }
00880 
00881         // Successfully read the record.  Check to see if it is in the correct 
00882         // reference/position.
00883         if(record.getReferenceID() != myRefID)
00884         {
00885             // Incorrect reference ID, return failure.
00886             myStatus = SamStatus::NO_MORE_RECS;
00887             return(false);
00888         }
00889    
00890         // Found a record.
00891         recordFound = true;
00892 
00893         // If start/end position are set, verify that the alignment falls
00894         // completely within those.
00895         if((myStartPos != -1) && (myEndPos != -1))
00896         {
00897             // Start & end are specified, so check if the alignment 
00898             // overlaps this section.
00899 
00900             // If the alignment start is greater than the end of the region,
00901             // return NO_MORE_RECS.
00902             if(record.get0BasedPosition() >= myEndPos)
00903             {
00904                 myStatus = SamStatus::NO_MORE_RECS;
00905                 return(false);
00906             }
00907 
00908             // We know the start is less than the end position, so the alignment
00909             // overlaps the region if the alignment end position is greater than the
00910             // start of the region.
00911             if(record.get0BasedAlignmentEnd() < myStartPos)
00912             {
00913                 // If it does not overlap the region, so go to the next
00914                 // record...set recordFound back to false.
00915                 recordFound = false;
00916             }
00917         }
00918     }
00919 
00920     // Successfully read the record, so check the sort order.
00921     if(!validateSortOrder(record, header))
00922     {
00923         myStatus.setStatus(SamStatus::INVALID_SORT, 
00924                            "The file is not properly sorted.");
00925         return(false);
00926     }
00927    
00928     // If it made it here, then the record was successfully read
00929     // and met the criteria, so return success.
00930     myRecordCount++;
00931 
00932     if(myStatistics != NULL)
00933     {
00934         // Successfully read, statistics should be updated.
00935         myStatistics->updateStatistics(record);
00936     }
00937     
00938     return(true);
00939 }
00940 
00941 
00942 bool SamFile::processNewSection(SamFileHeader &header)
00943 {
00944     myNewSection = false;
00945 
00946     // If there is no index file, return failure.
00947     if(myBamIndex == NULL)
00948     {
00949         // No bam index has been read.
00950         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00951                            "Canot read section since there is no index file open");
00952         throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section prior to opening the BAM Index file."));
00953         return(false);
00954     }
00955 
00956     // If there is not a BAM file open for reading, return failure.
00957     if(!myIsBamOpenForRead)
00958     {
00959         // There is not a BAM file open for reading.
00960         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00961                            "Canot read section since there is no bam file open");
00962         throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section without opening a BAM file."));
00963         return(false);
00964     }
00965 
00966     if(myHasHeader == false)
00967     {
00968         // The header has not yet been read.
00969         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00970                            "Cannot read record since the header has not been read.");
00971         throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section prior to opening the header."));
00972         return(false);
00973     }
00974 
00975     myChunksToRead.clear();
00976     // Reset the end of the current chunk.  We are resetting our read, so
00977     // we no longer have a "current chunk" that we are reading.
00978     myCurrentChunkEnd = 0;
00979 
00980     // Check to see if the read section was set based on the reference name
00981     // but not yet converted to reference id.
00982     if(!myRefName.empty())
00983     {
00984         myRefID = header.getReferenceID(myRefName.c_str());
00985         // Clear the myRefName length so this code is only executed once.
00986         myRefName.clear();
00987     }
00988 
00989     // Get the chunks associated with this reference region.
00990     if(myBamIndex->getChunksForRegion(myRefID, myStartPos, myEndPos, 
00991                                       myChunksToRead) == true)
00992     {
00993         myStatus = SamStatus::SUCCESS;
00994     }
00995     else
00996     {
00997         myStatus.setStatus(SamStatus::FAIL_PARSE, 
00998                            "Failed to get the specified region.");
00999     }
01000     return(true);
01001 }
01002 
01003 
01004 // Default Constructor.
01005 SamFileReader::SamFileReader()
01006 {
01007 }
01008 
01009 
01010 // Constructor that opens the specified file for read.
01011 SamFileReader::SamFileReader(const char* filename)
01012 {
01013     if(!OpenForRead(filename))
01014     {
01015         // Failed to open for reading - print error and abort.
01016         fprintf(stderr, "%s\n", GetStatusMessage());
01017         std::cerr << "FAILURE - EXITING!!!" << std::endl;
01018         exit(-1);
01019     }
01020 }
01021 
01022 
01023 SamFileReader::~SamFileReader()
01024 {
01025 }
01026 
01027 
01028 // Default Constructor.
01029 SamFileWriter::SamFileWriter()
01030 {
01031 }
01032 
01033 
01034 // Constructor that opens the specified file for write.
01035 SamFileWriter::SamFileWriter(const char* filename)
01036 {
01037     if(!OpenForWrite(filename))
01038     {
01039         // Failed to open for reading - print error and abort.
01040         fprintf(stderr, "%s\n", GetStatusMessage());
01041         std::cerr << "FAILURE - EXITING!!!" << std::endl;
01042         exit(-1);
01043     }
01044 }
01045 
01046 
01047 SamFileWriter::~SamFileWriter()
01048 {
01049 }
Generated on Wed Nov 17 15:38:27 2010 for StatGen Software by  doxygen 1.6.3