libStatGen Software  1
Tabix.cpp
00001 /*
00002  *  Copyright (C) 2012-2013  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 #include "Tabix.h"
00018 #include <stdexcept>
00019 #include "StringBasics.h"
00020 
00021 Tabix::Tabix()
00022     : IndexBase(),
00023       myChromNamesBuffer(NULL)
00024 {
00025 }
00026 
00027 
00028 Tabix::~Tabix()
00029 {
00030     if(myChromNamesBuffer != NULL)
00031     {
00032         delete[] myChromNamesBuffer;
00033         myChromNamesBuffer = NULL;
00034     }
00035 }
00036 
00037 
00038 // Reset the member data for a new index file.
00039 void Tabix::resetIndex()
00040 {
00041     IndexBase::resetIndex();
00042     if(myChromNamesBuffer != NULL)
00043     {
00044         delete[] myChromNamesBuffer;
00045         myChromNamesBuffer = NULL;
00046     }
00047     myChromNamesVector.clear();
00048 }
00049 
00050 
00051 // Read & parse the specified index file.
00052 StatGenStatus::Status Tabix::readIndex(const char* filename)
00053 {
00054     // Reset the index from anything that may previously be set.
00055     resetIndex();
00056 
00057     IFILE indexFile = ifopen(filename, "rb");
00058 
00059     // Failed to open the index file.
00060     if(indexFile == NULL)
00061     {
00062         return(StatGenStatus::FAIL_IO);
00063     }
00064 
00065     // read the tabix index structure.
00066 
00067     // Read the magic string.
00068     char magic[4];
00069     if(ifread(indexFile, magic, 4) != 4)
00070     {
00071         // Failed to read the magic
00072         return(StatGenStatus::FAIL_IO);
00073     }
00074 
00075     // If this is not an index file, set num references to 0. 
00076     if (magic[0] != 'T' || magic[1] != 'B' || magic[2] != 'I' || magic[3] != 1)
00077     {
00078         // Not a Tabix Index file.
00079         return(StatGenStatus::FAIL_PARSE);
00080     }
00081 
00082     // It is a tabix index file.
00083     // Read the number of reference sequences.
00084     if(ifread(indexFile, &n_ref, 4) != 4)
00085     {
00086         // Failed to read.
00087         return(StatGenStatus::FAIL_IO);
00088     }
00089 
00090     // Size the references.
00091     myRefs.resize(n_ref);
00092 
00093     // Read the Format configuration.
00094     if(ifread(indexFile, &myFormat, sizeof(myFormat)) != sizeof(myFormat))
00095     {
00096         // Failed to read.
00097         return(StatGenStatus::FAIL_IO);
00098     }
00099 
00100     // Read the length of the chromosome names.
00101     uint32_t l_nm;
00102 
00103     if(ifread(indexFile, &l_nm, sizeof(l_nm)) != sizeof(l_nm))
00104     {
00105         // Failed to read.
00106         return(StatGenStatus::FAIL_IO);
00107     }
00108 
00109     // Read the chromosome names.
00110     myChromNamesBuffer = new char[l_nm];
00111     if(ifread(indexFile, myChromNamesBuffer, l_nm) != l_nm)
00112     {
00113         return(StatGenStatus::FAIL_IO);
00114     }
00115     myChromNamesVector.resize(n_ref);
00116 
00117     // Parse out the chromosome names.
00118     bool prevNull = true;
00119     int chromIndex = 0;
00120     for(uint32_t i = 0; i < l_nm; i++)
00121     {
00122         if(chromIndex >= n_ref)
00123         {
00124             // already set the pointer for the last chromosome name, 
00125             // so stop looping.
00126             break;
00127         }
00128         if(prevNull == true)
00129         {
00130             myChromNamesVector[chromIndex++] = myChromNamesBuffer + i;
00131             prevNull = false;
00132         }
00133         if(myChromNamesBuffer[i] == '\0')
00134         {
00135             prevNull = true;
00136         }
00137     }
00138 
00139     for(int refIndex = 0; refIndex < n_ref; refIndex++)
00140     {
00141         // Read each reference.
00142         Reference* ref = &(myRefs[refIndex]);
00143         
00144         // Resize the bins so they can be indexed by bin number.
00145         ref->bins.resize(MAX_NUM_BINS + 1);
00146         
00147         // Read the number of bins.
00148         if(ifread(indexFile, &(ref->n_bin), 4) != 4)
00149         {
00150             // Failed to read the number of bins.
00151             // Return failure.
00152             return(StatGenStatus::FAIL_PARSE);
00153         }
00154 
00155         // Read each bin.
00156         for(int binIndex = 0; binIndex < ref->n_bin; binIndex++)
00157         {
00158             uint32_t binNumber;
00159 
00160             // Read in the bin number.
00161             if(ifread(indexFile, &(binNumber), 4) != 4)
00162             {
00163                 // Failed to read the bin number.
00164                 // Return failure.
00165                 return(StatGenStatus::FAIL_IO);
00166             }
00167 
00168             // Add the bin to the reference and get the
00169             // pointer back so the values can be set in it.
00170             Bin* binPtr = &(ref->bins[binNumber]);
00171             binPtr->bin = binNumber;
00172          
00173             // Read in the number of chunks.
00174             if(ifread(indexFile, &(binPtr->n_chunk), 4) != 4)
00175             {
00176                 // Failed to read number of chunks.
00177                 // Return failure.
00178                 return(StatGenStatus::FAIL_IO);
00179             }
00180 
00181             // Read in the chunks.
00182             // Allocate space for the chunks.
00183             uint32_t sizeOfChunkList = binPtr->n_chunk * sizeof(Chunk);
00184             binPtr->chunks = (Chunk*)malloc(sizeOfChunkList);
00185             if(ifread(indexFile, binPtr->chunks, sizeOfChunkList) != sizeOfChunkList)
00186             {
00187                 // Failed to read the chunks.
00188                 // Return failure.
00189                 return(StatGenStatus::FAIL_IO);
00190             }
00191         }
00192 
00193         // Read the number of intervals.
00194         if(ifread(indexFile, &(ref->n_intv), 4) != 4)
00195         {
00196             // Failed to read, set to 0.
00197             ref->n_intv = 0;
00198             // Return failure.
00199             return(StatGenStatus::FAIL_IO);
00200         }
00201 
00202         // Allocate space for the intervals and read them.
00203         uint32_t linearIndexSize = ref->n_intv * sizeof(uint64_t);
00204         ref->ioffsets = (uint64_t*)malloc(linearIndexSize);
00205         if(ifread(indexFile, ref->ioffsets, linearIndexSize) != linearIndexSize)
00206         {
00207             // Failed to read the linear index.
00208             // Return failure.
00209             return(StatGenStatus::FAIL_IO);
00210         }
00211     }
00212 
00213     // Successfully read teh bam index file.
00214     return(StatGenStatus::SUCCESS);
00215 }
00216 
00217 
00218 bool Tabix::getStartPos(const char* refName, int32_t start,
00219                         uint64_t& fileStartPos) const
00220 {
00221     // Look for the reference name in the list.
00222     int refID = 0;
00223     for(refID = 0; refID < n_ref; refID++)
00224     {
00225         if(strcmp(refName, myChromNamesVector[refID]) == 0)
00226         {
00227             // found the reference
00228             break;
00229         }
00230     }
00231     if(refID >= n_ref)
00232     {
00233         // Didn't find the refName, so return false.
00234         return(false);
00235     }
00236 
00237     // Look up in the linear index.
00238     if(start < 0)
00239     {
00240         // Negative index, so start at 0.
00241         start = 0;
00242     }
00243     return(getMinOffsetFromLinearIndex(refID, start, fileStartPos));
00244 }
00245 
00246 
00247 const char* Tabix::getRefName(unsigned int indexNum) const
00248 {
00249     if(indexNum >= myChromNamesVector.size())
00250     {
00251         String message = "ERROR: Out of range on Tabix::getRefName(";
00252         message += indexNum;
00253         message += ")";
00254         throw(std::runtime_error(message.c_str()));
00255         return(NULL);
00256     }
00257     return(myChromNamesVector[indexNum]);
00258 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends