SamFile.cpp

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 #include <stdexcept>
00018 #include "SamFile.h"
00019 #include "SamFileHeader.h"
00020 #include "SamRecord.h"
00021 #include "BamInterface.h"
00022 #include "SamInterface.h"
00023 
00024 // Constructor, init variables.
00025 SamFile::SamFile()
00026     : myFilePtr(NULL),
00027       myInterfacePtr(NULL),
00028       myStatistics(NULL),
00029       myStatus(),
00030       myBamIndex(NULL),
00031       myRefPtr(NULL),
00032       myReadTranslation(SamRecord::NONE),
00033       myWriteTranslation(SamRecord::NONE)
00034 {
00035     resetFile();
00036 }
00037 
00038 
00039 // Constructor, init variables.
00040 SamFile::SamFile(ErrorHandler::HandlingType errorHandlingType)
00041     : myFilePtr(NULL),
00042       myInterfacePtr(NULL),
00043       myStatistics(NULL),
00044       myStatus(errorHandlingType),
00045       myBamIndex(NULL),
00046       myRefPtr(NULL),
00047       myReadTranslation(SamRecord::NONE),
00048       myWriteTranslation(SamRecord::NONE)
00049 {
00050     resetFile();
00051 }
00052 
00053 
00054 // Constructor, init variables and open the specified file based on the
00055 // specified mode (READ/WRITE).
00056 SamFile::SamFile(const char* filename, OpenType mode)
00057     : myFilePtr(NULL),
00058       myInterfacePtr(NULL),
00059       myStatistics(NULL),
00060       myStatus(),
00061       myBamIndex(NULL),
00062       myRefPtr(NULL),
00063       myReadTranslation(SamRecord::NONE),
00064       myWriteTranslation(SamRecord::NONE)
00065 {
00066     resetFile();
00067 
00068     bool openStatus = true;
00069     if(mode == READ)
00070     {
00071         // open the file for read.
00072         openStatus = OpenForRead(filename);
00073     }
00074     else
00075     {
00076         // open the file for write.
00077         openStatus = OpenForWrite(filename);
00078     }
00079     if(!openStatus)
00080     {
00081         // Failed to open the file - print error and abort.
00082         fprintf(stderr, "%s\n", GetStatusMessage());
00083         std::cerr << "FAILURE - EXITING!!!" << std::endl;
00084         exit(-1);
00085     }
00086 }
00087 
00088 
00089 // Constructor, init variables and open the specified file based on the
00090 // specified mode (READ/WRITE).  Default is READ..
00091 SamFile::SamFile(const char* filename, OpenType mode,
00092                  ErrorHandler::HandlingType errorHandlingType)
00093     : myFilePtr(NULL),
00094       myInterfacePtr(NULL),
00095       myStatistics(NULL),
00096       myStatus(errorHandlingType),
00097       myBamIndex(NULL),
00098       myRefPtr(NULL),
00099       myReadTranslation(SamRecord::NONE),
00100       myWriteTranslation(SamRecord::NONE)
00101 {
00102     resetFile();
00103 
00104     bool openStatus = true;
00105     if(mode == READ)
00106     {
00107         // open the file for read.
00108         openStatus = OpenForRead(filename);
00109     }
00110     else
00111     {
00112         // open the file for write.
00113         openStatus = OpenForWrite(filename);
00114     }
00115     if(!openStatus)
00116     {
00117         // Failed to open the file - print error and abort.
00118         fprintf(stderr, "%s\n", GetStatusMessage());
00119         std::cerr << "FAILURE - EXITING!!!" << std::endl;
00120         exit(-1);
00121     }
00122 }
00123 
00124 
00125 SamFile::~SamFile()
00126 {
00127     resetFile();
00128     if(myStatistics != NULL)
00129     {
00130         delete myStatistics;
00131     }
00132 }
00133 
00134 
00135 // Open a sam/bam file for reading with the specified filename.
00136 bool SamFile::OpenForRead(const char * filename)
00137 {
00138     // Reset for any previously operated on files.
00139     resetFile();
00140 
00141     int lastchar = 0;
00142 
00143     while (filename[lastchar] != 0) lastchar++;
00144 
00145     // If at least one character, check for '-'.
00146     if((lastchar >= 1) && (filename[0] == '-'))
00147     {
00148         // Read from stdin - determine type of file to read.
00149         // Determine if compressed bam.
00150         if(strcmp(filename, "-.bam") == 0)
00151         {
00152             // Compressed bam - open as bgzf.
00153             // -.bam is the filename, read compressed bam from stdin
00154             filename = "-";
00155             myFilePtr = ifopen(filename, "rb", InputFile::BGZF);
00156             
00157             myInterfacePtr = new BamInterface;
00158 
00159             // Read the magic string.
00160             char magic[4];
00161             ifread(myFilePtr, magic, 4);
00162         }
00163         else if(strcmp(filename, "-.ubam") == 0)
00164         {
00165             // uncompressed BAM File.
00166             // -.ubam is the filename, read uncompressed bam from stdin
00167             filename = "-";
00168             myFilePtr = ifopen(filename, "rb", InputFile::UNCOMPRESSED);
00169         
00170             myInterfacePtr = new BamInterface;
00171 
00172             // Read the magic string.
00173             char magic[4];
00174             ifread(myFilePtr, magic, 4);
00175         }
00176         else
00177         {
00178             // SAM File.
00179             // read sam from stdin
00180             filename = "-";
00181             myFilePtr = ifopen(filename, "rb", InputFile::UNCOMPRESSED);
00182             myInterfacePtr = new SamInterface;
00183         }
00184     }
00185     else
00186     {
00187         // Not from stdin.  Read the file to determine the type.
00188         myFilePtr = ifopen(filename, "rb");
00189         
00190         if (myFilePtr == NULL)
00191         {
00192             std::string errorMessage = "Failed to Open ";
00193             errorMessage += filename;
00194             errorMessage += " for reading";
00195             myStatus.setStatus(SamStatus::FAIL_IO, errorMessage.c_str());
00196             return(false);
00197         }
00198         
00199         char magic[4];
00200         ifread(myFilePtr, magic, 4);
00201         
00202         if (magic[0] == 'B' && magic[1] == 'A' && magic[2] == 'M' &&
00203             magic[3] == 1)
00204         {
00205             myInterfacePtr = new BamInterface;
00206             // Set that it is a bam file open for reading.  This is needed to
00207             // determine if an index file can be used.
00208             myIsBamOpenForRead = true;
00209         }
00210         else
00211         {
00212             // Not a bam, so rewind to the beginning of the file so it
00213             // can be read.
00214             ifrewind(myFilePtr);
00215             myInterfacePtr = new SamInterface;
00216         }
00217     }
00218 
00219     // File is open for reading.
00220     myIsOpenForRead = true;
00221     // Successfully opened the file.
00222     myStatus = SamStatus::SUCCESS;
00223     return(true);
00224 }
00225 
00226 
00227 // Open a sam/bam file for writing with the specified filename.
00228 bool SamFile::OpenForWrite(const char * filename)
00229 {
00230     // Reset for any previously operated on files.
00231     resetFile();
00232     
00233     int lastchar = 0;
00234     while (filename[lastchar] != 0) lastchar++;   
00235     if (lastchar >= 4 && 
00236         filename[lastchar - 4] == 'u' &&
00237         filename[lastchar - 3] == 'b' &&
00238         filename[lastchar - 2] == 'a' &&
00239         filename[lastchar - 1] == 'm')
00240     {
00241         // BAM File.
00242         // if -.ubam is the filename, write uncompressed bam to stdout
00243         if((lastchar == 6) && (filename[0] == '-') && (filename[1] == '.'))
00244         {
00245             filename = "-";
00246         }
00247         myFilePtr = ifopen(filename, "wb", InputFile::UNCOMPRESSED);
00248 
00249         myInterfacePtr = new BamInterface;
00250     }
00251     else if (lastchar >= 3 && 
00252              filename[lastchar - 3] == 'b' &&
00253              filename[lastchar - 2] == 'a' &&
00254              filename[lastchar - 1] == 'm')
00255     {
00256         // BAM File.
00257         // if -.bam is the filename, write compressed bam to stdout
00258         if((lastchar == 5) && (filename[0] == '-') && (filename[1] == '.'))
00259         {
00260             filename = "-";
00261         }
00262         myFilePtr = ifopen(filename, "wb", InputFile::BGZF);
00263         
00264         myInterfacePtr = new BamInterface;
00265     }
00266     else
00267     {
00268         // SAM File
00269         // if - (followed by anything is the filename,
00270         // write uncompressed sam to stdout
00271         if((lastchar >= 1) && (filename[0] == '-'))
00272         {
00273             filename = "-";
00274         }
00275         myFilePtr = ifopen(filename, "wb", InputFile::UNCOMPRESSED);
00276    
00277         myInterfacePtr = new SamInterface;
00278     }
00279 
00280     if (myFilePtr == NULL)
00281     {
00282         std::string errorMessage = "Failed to Open ";
00283         errorMessage += filename;
00284         errorMessage += " for writing";
00285         myStatus.setStatus(SamStatus::FAIL_IO, errorMessage.c_str());
00286         return(false);
00287     }
00288    
00289     myIsOpenForWrite = true;
00290 
00291     // Successfully opened the file.
00292     myStatus = SamStatus::SUCCESS;
00293     return(true);
00294 }
00295 
00296 
00297 // Read BAM Index file.
00298 bool SamFile::ReadBamIndex(const char* bamIndexFilename)
00299 {
00300     // Cleanup a previously setup index.
00301     if(myBamIndex != NULL)
00302     {
00303         delete myBamIndex;
00304         myBamIndex = NULL;
00305     }
00306 
00307     // Create a new bam index.
00308     myBamIndex = new BamIndex();
00309     SamStatus::Status indexStat = myBamIndex->readIndex(bamIndexFilename);
00310 
00311     if(indexStat != SamStatus::SUCCESS)
00312     {
00313         std::string errorMessage = "Failed to read the bam Index file: ";
00314         errorMessage += bamIndexFilename;
00315         myStatus.setStatus(indexStat, errorMessage.c_str());
00316         delete myBamIndex;
00317         myBamIndex = NULL;
00318         return(false);
00319     }
00320     myStatus = SamStatus::SUCCESS;
00321     return(true);
00322 }
00323 
00324 
00325 // Sets the reference to the specified genome sequence object.
00326 void SamFile::SetReference(GenomeSequence* reference)
00327 {
00328     myRefPtr = reference;
00329 }
00330 
00331 
00332 // Set the type of sequence translation to use when reading the sequence.
00333 void SamFile::SetReadSequenceTranslation(SamRecord::SequenceTranslation translation)
00334 {
00335     myReadTranslation = translation;
00336 }
00337 
00338 
00339 // Set the type of sequence translation to use when writing the sequence.
00340 void SamFile::SetWriteSequenceTranslation(SamRecord::SequenceTranslation translation)
00341 {
00342     myWriteTranslation = translation;
00343 }
00344 
00345 // Close the file if there is one open.
00346 void SamFile::Close()
00347 {
00348     // Resetting the file will close it if it is open, and
00349     // will reset all other variables.
00350     resetFile();
00351 }
00352 
00353 
00354 // Returns whether or not the end of the file has been reached.
00355 // return: int - true = EOF; false = not eof.
00356 bool SamFile::IsEOF()
00357 {
00358     if (myFilePtr != NULL)
00359     {
00360         // File Pointer is set, so return if eof.
00361         return(ifeof(myFilePtr));
00362     }
00363     // File pointer is not set, so return true, eof.
00364     return true;
00365 }
00366 
00367 
00368 // Read the header from the currently opened file.
00369 bool SamFile::ReadHeader(SamFileHeader& header)
00370 {
00371     if(myIsOpenForRead == false)
00372     {
00373         // File is not open for read
00374         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00375                            "Cannot read header since the file is not open for reading");
00376         return(false);
00377     }
00378 
00379     if(myHasHeader == true)
00380     {
00381         // The header has already been read.
00382         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00383                            "Cannot read header since it has already been read.");
00384         return(false);
00385     }
00386 
00387     myStatus = myInterfacePtr->readHeader(myFilePtr, header);
00388     if(myStatus == SamStatus::SUCCESS)
00389     {
00390         // The header has now been successfully read.
00391         myHasHeader = true;
00392         return(true);
00393     }
00394     return(false);
00395 }
00396 
00397 
00398 // Write the header to the currently opened file.
00399 bool SamFile::WriteHeader(SamFileHeader& header)
00400 {
00401     if(myIsOpenForWrite == false)
00402     {
00403         // File is not open for write
00404         // -OR-
00405         // The header has already been written.
00406         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00407                            "Cannot write header since the file is not open for writing");
00408         return(false);
00409     }
00410 
00411     if(myHasHeader == true)
00412     {
00413         // The header has already been written.
00414         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00415                            "Cannot write header since it has already been written");
00416         return(false);
00417     }
00418 
00419     myStatus = myInterfacePtr->writeHeader(myFilePtr, header);
00420     if(myStatus == SamStatus::SUCCESS)
00421     {
00422         // The header has now been successfully written.
00423         myHasHeader = true;
00424         return(true);
00425     }
00426 
00427     // return the status.
00428     return(false);
00429 }
00430 
00431 
00432 // Read a record from the currently opened file.
00433 bool SamFile::ReadRecord(SamFileHeader& header, 
00434                          SamRecord& record)
00435 {
00436     myStatus = SamStatus::SUCCESS;
00437 
00438     if(myIsOpenForRead == false)
00439     {
00440         // File is not open for read
00441         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00442                            "Cannot read record since the file is not open for reading");
00443         throw(std::runtime_error("SOFTWARE BUG: trying to read a SAM/BAM record prior to opening the file."));
00444         return(false);
00445     }
00446 
00447     if(myHasHeader == false)
00448     {
00449         // The header has not yet been read.
00450         // TODO - maybe just read the header.
00451         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00452                            "Cannot read record since the header has not been read.");
00453         throw(std::runtime_error("SOFTWARE BUG: trying to read a SAM/BAM record prior to reading the header."));
00454         return(false);
00455     }
00456 
00457     // Check to see if a new region has been set.  If so, determine the
00458     // chunks for that region.
00459     if(myNewSection)
00460     {
00461         if(!processNewSection(header))
00462         {
00463             // Failed processing a new section.  Could be an 
00464             // order issue like the file not being open or the
00465             // indexed file not having been read.
00466             // processNewSection sets myStatus with the failure reason.
00467             return(false);
00468         }
00469     }
00470 
00471     // Check to see if the file should be read by index.
00472     if(myRefID != BamIndex::REF_ID_ALL)
00473     {
00474         // Reference ID is set, so read by index.
00475         return(readIndexedRecord(header, record));
00476     }
00477 
00478     record.setReference(myRefPtr);
00479     record.setSequenceTranslation(myReadTranslation);
00480 
00481     // File is open for reading and the header has been read, so read the next
00482     // record.
00483     myInterfacePtr->readRecord(myFilePtr, header, record, myStatus);
00484     if(myStatus == SamStatus::SUCCESS)
00485     {
00486         // A record was successfully read, so increment the record count.
00487         myRecordCount++;
00488 
00489         if(myStatistics != NULL)
00490         {
00491             // Statistics should be updated.
00492             myStatistics->updateStatistics(record);
00493         }
00494 
00495         // Successfully read the record, so check the sort order.
00496         if(!validateSortOrder(record, header))
00497         {
00498             // ValidateSortOrder sets the status on a failure.
00499             return(false);
00500         }
00501         return(true);
00502     }
00503     // Failed to read the record.
00504     return(false);
00505 }
00506 
00507 
00508 
00509 // Write a record to the currently opened file.
00510 bool SamFile::WriteRecord(SamFileHeader& header, 
00511                           SamRecord& record)
00512 {
00513     if(myIsOpenForWrite == false)
00514     {
00515         // File is not open for writing
00516         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00517                            "Cannot write record since the file is not open for writing");
00518         return(false);
00519     }
00520 
00521     if(myHasHeader == false)
00522     {
00523         // The header has not yet been written.
00524         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00525                            "Cannot write record since the header has not been written");
00526         return(false);
00527     }
00528 
00529     // Before trying to write the record, validate the sort order.
00530     if(!validateSortOrder(record, header))
00531     {
00532         // Not sorted like it is supposed to be, do not write the record
00533         myStatus.setStatus(SamStatus::INVALID_SORT, 
00534                            "Cannot write the record since the file is not properly sorted.");
00535         return(false);
00536     }
00537 
00538     if(myRefPtr != NULL)
00539     {
00540         record.setReference(myRefPtr);
00541     }
00542 
00543     // File is open for writing and the header has been written, so write the
00544     // record.
00545     myStatus = myInterfacePtr->writeRecord(myFilePtr, header, record,
00546                                            myWriteTranslation);
00547 
00548     if(myStatus == SamStatus::SUCCESS)
00549     {
00550         // A record was successfully written, so increment the record count.
00551         myRecordCount++;
00552         return(true);
00553     }
00554     return(false);
00555 }
00556 
00557 
00558 // Set the flag to validate that the file is sorted as it is read/written.
00559 // Must be called after the file has been opened.
00560 void SamFile::setSortedValidation(SortedType sortType)
00561 {
00562     mySortedType = sortType;
00563 }
00564 
00565 
00566 // Return the number of records that have been read/written so far.
00567 uint32_t SamFile::GetCurrentRecordCount()
00568 {
00569     return(myRecordCount);
00570 }
00571 
00572 
00573 // Sets what part of the SamFile should be read.
00574 bool SamFile::SetReadSection(int32_t refID)
00575 {
00576     // No start/end specified, so set back to default -1.
00577     return(SetReadSection(refID, -1, -1));
00578 }
00579 
00580 
00581 
00582 // Sets what part of the SamFile should be read.
00583 bool SamFile::SetReadSection(const char* refName)
00584 {
00585     // No start/end specified, so set back to default -1.
00586     return(SetReadSection(refName, -1, -1));
00587 }
00588 
00589 
00590 // Sets what part of the BAM file should be read.
00591 bool SamFile::SetReadSection(int32_t refID, int32_t start, int32_t end)
00592 {
00593     // If there is not a BAM file open for reading, return failure.
00594     // Opening a new file clears the read section, so it must be
00595     // set after the file is opened.
00596     if(!myIsBamOpenForRead)
00597     {
00598         // There is not a BAM file open for reading.
00599         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00600                            "Canot set section since there is no bam file open");
00601         return(false);
00602     }
00603 
00604     myNewSection = true;
00605     myStartPos = start;
00606     myEndPos = end;
00607     myRefID = refID;
00608     myRefName.clear();
00609     myChunksToRead.clear();
00610     // Reset the end of the current chunk.  We are resetting our read, so
00611     // we no longer have a "current chunk" that we are reading.
00612     myCurrentChunkEnd = 0;
00613     myStatus = SamStatus::SUCCESS;
00614     
00615     return(true);
00616 }
00617 
00618 
00619 // Sets what part of the BAM file should be read.
00620 bool SamFile::SetReadSection(const char* refName, int32_t start, int32_t end)
00621 {
00622     // If there is not a BAM file open for reading, return failure.
00623     // Opening a new file clears the read section, so it must be
00624     // set after the file is opened.
00625     if(!myIsBamOpenForRead)
00626     {
00627         // There is not a BAM file open for reading.
00628         myStatus.setStatus(SamStatus::FAIL_ORDER, 
00629                            "Canot set section since there is no bam file open");
00630         return(false);
00631     }
00632 
00633     myNewSection = true;
00634     myStartPos = start;
00635     myEndPos = end;
00636     if((strcmp(refName, "") == 0) || (strcmp(refName, "*") == 0))
00637     {
00638         // No Reference name specified, so read just the "-1" entries.
00639         myRefID = BamIndex::REF_ID_UNMAPPED;
00640     }
00641     else
00642     {
00643         // save the reference name and revert the reference ID to unknown
00644         // so it will be calculated later.
00645         myRefName = refName;
00646         myRefID = BamIndex::REF_ID_ALL;
00647     }
00648     myChunksToRead.clear();
00649     // Reset the end of the current chunk.  We are resetting our read, so
00650     // we no longer have a "current chunk" that we are reading.
00651     myCurrentChunkEnd = 0;
00652     myStatus = SamStatus::SUCCESS;
00653     
00654     return(true);
00655 }
00656 
00657 
00658 // Returns the number of bases in the passed in read that overlap the
00659 // region that is currently set.
00660 uint32_t SamFile::GetNumOverlaps(SamRecord& samRecord)
00661 {
00662     if(myRefPtr != NULL)
00663     {
00664         samRecord.setReference(myRefPtr);
00665     }
00666     samRecord.setSequenceTranslation(myReadTranslation);
00667 
00668     // Get the overlaps in the sam record for the region currently set
00669     // for this file.
00670     return(samRecord.getNumOverlaps(myStartPos, myEndPos));
00671 }
00672 
00673 
00674 void SamFile::GenerateStatistics(bool genStats)
00675 {
00676     if(genStats)
00677     {
00678         if(myStatistics == NULL)
00679         {
00680             // Want to generate statistics, but do not yet have the
00681             // structure for them, so create one.
00682             myStatistics = new SamStatistics();
00683         }
00684     }
00685     else
00686     {
00687         // Do not generate statistics, so if myStatistics is not NULL, 
00688         // delete it.
00689         if(myStatistics != NULL)
00690         {
00691             delete myStatistics;
00692             myStatistics = NULL;
00693         }
00694     }
00695 
00696 }
00697 
00698 
00699 // Reset variables for each file.
00700 void SamFile::resetFile()
00701 {
00702     // Close the file.
00703     if (myFilePtr != NULL)
00704     {
00705         // If we already have an open file, close it.
00706         ifclose(myFilePtr);
00707         myFilePtr = NULL;
00708     }
00709     if(myInterfacePtr != NULL)
00710     {
00711         delete myInterfacePtr;
00712         myInterfacePtr = NULL;
00713     }
00714 
00715     myIsOpenForRead = false;
00716     myIsOpenForWrite = false;
00717     myHasHeader = false;
00718     mySortedType = UNSORTED;
00719     myPrevReadName = "";
00720     myPrevCoord = -1;
00721     myPrevRefID = 0;
00722     myRecordCount = 0;
00723     myStatus = SamStatus::SUCCESS;
00724 
00725     // Reset indexed bam values.
00726     myIsBamOpenForRead = false;
00727     myRefID = BamIndex::REF_ID_ALL;
00728     myStartPos = -1;
00729     myEndPos = -1;
00730     myNewSection = false;
00731     myCurrentChunkEnd = 0;
00732     myChunksToRead.clear();
00733     if(myBamIndex != NULL)
00734     {
00735         delete myBamIndex;
00736         myBamIndex = NULL;
00737     }
00738 
00739     // If statistics are being generated, reset them.
00740     if(myStatistics != NULL)
00741     {
00742         myStatistics->reset();
00743     }
00744 
00745     myRefName.clear();
00746 }
00747 
00748 
00749 // Validate that the record is sorted compared to the previously read record
00750 // if there is one, according to the specified sort order.
00751 // If the sort order is UNSORTED, true is returned.
00752 bool SamFile::validateSortOrder(SamRecord& record, SamFileHeader& header)
00753 {
00754     if(myRefPtr != NULL)
00755     {
00756         record.setReference(myRefPtr);
00757     }
00758     record.setSequenceTranslation(myReadTranslation);
00759 
00760     bool status = false;
00761     if(mySortedType == UNSORTED)
00762     {
00763         // Unsorted, so nothing to validate, just return true.
00764         status = true;
00765     }
00766     else 
00767     {
00768         // Check to see if mySortedType is based on the header.
00769         if(mySortedType == FLAG)
00770         {
00771             // Determine the sorted type from what was read out of the header.
00772             mySortedType = getSortOrderFromHeader(header);
00773         }
00774 
00775         if(mySortedType == QUERY_NAME)
00776         {
00777             // Validate that it is sorted by query name.
00778             // Get the query name from the record.
00779             const char* readName = record.getReadName();
00780             if(myPrevReadName.compare(readName) > 0)
00781             {
00782                 // The previous name is greater than the new record's name, so
00783                 // return false.
00784                 String errorMessage = "ERROR: File is not sorted at record ";
00785                 errorMessage += myRecordCount;
00786                 myStatus.setStatus(SamStatus::INVALID_SORT, 
00787                                    errorMessage.c_str());
00788                 status = false;
00789             }
00790             else
00791             {
00792                 myPrevReadName = readName;
00793                 status = true;
00794             }
00795         }
00796         else 
00797         {
00798             // Validate that it is sorted by COORDINATES.
00799             // Get the leftmost coordinate and the reference index.
00800             int32_t refID = record.getReferenceID();
00801             int32_t coord = record.get0BasedPosition();
00802             // The unmapped reference id is at the end of a sorted file.
00803             if(refID == BamIndex::REF_ID_UNMAPPED)
00804             {
00805                 // A new reference ID that is for the unmapped reads
00806                 // is always valid.
00807                 status = true;
00808                 myPrevRefID = refID;
00809                 myPrevCoord = coord;
00810             }
00811             else if(myPrevRefID == BamIndex::REF_ID_UNMAPPED)
00812             {
00813                 // Previous reference ID was for unmapped reads, but the
00814                 // current one is not, so this is not sorted.
00815                 String errorMessage = "ERROR: File is not sorted at record ";
00816                 errorMessage += myRecordCount;
00817                 myStatus.setStatus(SamStatus::INVALID_SORT, 
00818                                    errorMessage.c_str());
00819                 status = false;
00820             }
00821             else if(refID < myPrevRefID)
00822             {
00823                 // Current reference id is less than the previous one, 
00824                 //meaning that it is not sorted.
00825                 String errorMessage = "ERROR: File is not sorted at record ";
00826                 errorMessage += myRecordCount;
00827                 myStatus.setStatus(SamStatus::INVALID_SORT, 
00828                                    errorMessage.c_str());
00829                 status = false;
00830             }
00831             else
00832             {
00833                 // The reference IDs are in the correct order.
00834                 if(refID > myPrevRefID)
00835                 {
00836                     // New reference id, so set the previous coordinate to -1
00837                     myPrevCoord = -1;
00838                 }
00839             
00840                 // Check the coordinates.
00841                 if(coord < myPrevCoord)
00842                 {
00843                     // New Coord is less than the previous position.
00844                     String errorMessage = "ERROR: File is not sorted at record ";
00845                     errorMessage += myRecordCount;
00846                     myStatus.setStatus(SamStatus::INVALID_SORT, 
00847                                        errorMessage.c_str());
00848                     status = false;
00849                 }
00850                 else
00851                 {
00852                     myPrevRefID = refID;
00853                     myPrevCoord = coord;
00854                     status = true;
00855                 }
00856             }
00857         }
00858     }
00859 
00860     return(status);
00861 }
00862 
00863 
00864 SamFile::SortedType SamFile::getSortOrderFromHeader(SamFileHeader& header)
00865 {
00866     const char* tag = header.getSortOrder();
00867    
00868     // Default to unsorted since if it is not specified in the header
00869     // that is the value that should be used.
00870     SortedType headerSortOrder = UNSORTED;
00871     if(strcmp(tag, "queryname") == 0)
00872     {
00873         headerSortOrder = QUERY_NAME;
00874     }
00875     else if(strcmp(tag, "coordinate") == 0)
00876     {
00877         headerSortOrder = COORDINATE;
00878     }
00879     return(headerSortOrder);
00880 }
00881 
00882 
00883 // Read a record from the currently opened file based on the indexing info.
00884 bool SamFile::readIndexedRecord(SamFileHeader& header, 
00885                                 SamRecord& record)
00886 {
00887     record.setReference(myRefPtr);
00888     record.setSequenceTranslation(myReadTranslation);
00889 
00890     // A section to read is set and the file is open for reading, as verified
00891     // prior to calling into this protected method.
00892     bool recordFound = false;
00893     while(!recordFound)
00894     {
00895         // Check to see if we have more to read out of the current chunk.
00896         // By checking the current position in relation to the current
00897         // end chunk.  If the current position is >= the end of the
00898         // current chunk, then we must see to the next chunk.
00899         uint64_t currentPos = iftell(myFilePtr);
00900         if(currentPos >= myCurrentChunkEnd)
00901         {
00902             // If there are no more chunks to process, return failure.
00903             if(myChunksToRead.empty())
00904             {
00905                 myStatus = SamStatus::NO_MORE_RECS;
00906                 return(false);
00907             }
00908       
00909             // There are more chunks left, so get the next chunk.
00910             Chunk newChunk = myChunksToRead.pop();
00911 
00912             // Move to the location of the new chunk if it is not adjacent
00913             // to the current chunk.
00914             if(newChunk.chunk_beg != currentPos)
00915             {
00916                 // New chunk is not adjacent, so move to it.
00917                 if(ifseek(myFilePtr, newChunk.chunk_beg, SEEK_SET) != true)
00918                 {
00919                     // seek failed, cannot retrieve next record, return failure.
00920                     myStatus.setStatus(SamStatus::FAIL_IO, 
00921                                        "Failed to seek to the next record");
00922                     return(false);
00923                 }
00924             }
00925             // Seek succeeded, set the end of the new chunk.
00926             myCurrentChunkEnd = newChunk.chunk_end;
00927         }
00928 
00929         // At the position of the next record.  Read the record.
00930         myInterfacePtr->readRecord(myFilePtr, header, record, myStatus);
00931         if(myStatus != SamStatus::SUCCESS)
00932         {
00933             // Reading the record failed, return failure.
00934             return(false);
00935         }
00936 
00937         // Successfully read the record.  Check to see if it is in the correct 
00938         // reference/position.
00939         if(record.getReferenceID() != myRefID)
00940         {
00941             // Incorrect reference ID, return failure.
00942             myStatus = SamStatus::NO_MORE_RECS;
00943             return(false);
00944         }
00945    
00946         // Found a record.
00947         recordFound = true;
00948 
00949         // If start/end position are set, verify that the alignment falls
00950         // completely within those.
00951         if((myStartPos != -1) && (myEndPos != -1))
00952         {
00953             // Start & end are specified, so check if the alignment 
00954             // overlaps this section.
00955 
00956             // If the alignment start is greater than the end of the region,
00957             // return NO_MORE_RECS.
00958             if(record.get0BasedPosition() >= myEndPos)
00959             {
00960                 myStatus = SamStatus::NO_MORE_RECS;
00961                 return(false);
00962             }
00963 
00964             // We know the start is less than the end position, so the alignment
00965             // overlaps the region if the alignment end position is greater than the
00966             // start of the region.
00967             if(record.get0BasedAlignmentEnd() < myStartPos)
00968             {
00969                 // If it does not overlap the region, so go to the next
00970                 // record...set recordFound back to false.
00971                 recordFound = false;
00972             }
00973         }
00974     }
00975 
00976     // Successfully read the record, so check the sort order.
00977     if(!validateSortOrder(record, header))
00978     {
00979         myStatus.setStatus(SamStatus::INVALID_SORT, 
00980                            "The file is not properly sorted.");
00981         return(false);
00982     }
00983    
00984     // If it made it here, then the record was successfully read
00985     // and met the criteria, so return success.
00986     myRecordCount++;
00987 
00988     if(myStatistics != NULL)
00989     {
00990         // Successfully read, statistics should be updated.
00991         myStatistics->updateStatistics(record);
00992     }
00993     
00994     return(true);
00995 }
00996 
00997 
00998 bool SamFile::processNewSection(SamFileHeader &header)
00999 {
01000     myNewSection = false;
01001 
01002     // If there is no index file, return failure.
01003     if(myBamIndex == NULL)
01004     {
01005         // No bam index has been read.
01006         myStatus.setStatus(SamStatus::FAIL_ORDER, 
01007                            "Canot read section since there is no index file open");
01008         throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section prior to opening the BAM Index file."));
01009         return(false);
01010     }
01011 
01012     // If there is not a BAM file open for reading, return failure.
01013     if(!myIsBamOpenForRead)
01014     {
01015         // There is not a BAM file open for reading.
01016         myStatus.setStatus(SamStatus::FAIL_ORDER, 
01017                            "Canot read section since there is no bam file open");
01018         throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section without opening a BAM file."));
01019         return(false);
01020     }
01021 
01022     if(myHasHeader == false)
01023     {
01024         // The header has not yet been read.
01025         myStatus.setStatus(SamStatus::FAIL_ORDER, 
01026                            "Cannot read record since the header has not been read.");
01027         throw(std::runtime_error("SOFTWARE BUG: trying to read a BAM record by section prior to opening the header."));
01028         return(false);
01029     }
01030 
01031     myChunksToRead.clear();
01032     // Reset the end of the current chunk.  We are resetting our read, so
01033     // we no longer have a "current chunk" that we are reading.
01034     myCurrentChunkEnd = 0;
01035 
01036     // Check to see if the read section was set based on the reference name
01037     // but not yet converted to reference id.
01038     if(!myRefName.empty())
01039     {
01040         myRefID = header.getReferenceID(myRefName.c_str());
01041         // Clear the myRefName length so this code is only executed once.
01042         myRefName.clear();
01043     }
01044 
01045     // Get the chunks associated with this reference region.
01046     if(myBamIndex->getChunksForRegion(myRefID, myStartPos, myEndPos, 
01047                                       myChunksToRead) == true)
01048     {
01049         myStatus = SamStatus::SUCCESS;
01050     }
01051     else
01052     {
01053         myStatus.setStatus(SamStatus::FAIL_PARSE, 
01054                            "Failed to get the specified region.");
01055     }
01056     return(true);
01057 }
01058 
01059 
01060 // Default Constructor.
01061 SamFileReader::SamFileReader()
01062 {
01063 }
01064 
01065 
01066 // Constructor that opens the specified file for read.
01067 SamFileReader::SamFileReader(const char* filename)
01068 {
01069     if(!OpenForRead(filename))
01070     {
01071         // Failed to open for reading - print error and abort.
01072         fprintf(stderr, "%s\n", GetStatusMessage());
01073         std::cerr << "FAILURE - EXITING!!!" << std::endl;
01074         exit(-1);
01075     }
01076 }
01077 
01078 
01079 SamFileReader::~SamFileReader()
01080 {
01081 }
01082 
01083 
01084 // Default Constructor.
01085 SamFileWriter::SamFileWriter()
01086 {
01087 }
01088 
01089 
01090 // Constructor that opens the specified file for write.
01091 SamFileWriter::SamFileWriter(const char* filename)
01092 {
01093     if(!OpenForWrite(filename))
01094     {
01095         // Failed to open for reading - print error and abort.
01096         fprintf(stderr, "%s\n", GetStatusMessage());
01097         std::cerr << "FAILURE - EXITING!!!" << std::endl;
01098         exit(-1);
01099     }
01100 }
01101 
01102 
01103 SamFileWriter::~SamFileWriter()
01104 {
01105 }
Generated on Thu Dec 9 12:22:12 2010 for StatGen Software by  doxygen 1.6.3