Tabix.cpp

00001 /*
00002  *  Copyright (C) 2012  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 #include "Tabix.h"
00018 
00019 Tabix::Tabix()
00020     : IndexBase(),
00021       myChromNamesBuffer(NULL)
00022 {
00023 }
00024 
00025 
00026 Tabix::~Tabix()
00027 {
00028     if(myChromNamesBuffer != NULL)
00029     {
00030         delete myChromNamesBuffer;
00031         myChromNamesBuffer = NULL;
00032     }
00033 }
00034 
00035 
00036 // Reset the member data for a new index file.
00037 void Tabix::resetIndex()
00038 {
00039     IndexBase::resetIndex();
00040     if(myChromNamesBuffer != NULL)
00041     {
00042         delete myChromNamesBuffer;
00043         myChromNamesBuffer = NULL;
00044     }
00045     myChromNamesVector.clear();
00046 }
00047 
00048 
00049 // Read & parse the specified index file.
00050 StatGenStatus::Status Tabix::readIndex(const char* filename)
00051 {
00052     // Reset the index from anything that may previously be set.
00053     resetIndex();
00054 
00055     IFILE indexFile = ifopen(filename, "rb");
00056 
00057     // Failed to open the index file.
00058     if(indexFile == NULL)
00059     {
00060         return(StatGenStatus::FAIL_IO);
00061     }
00062 
00063     // read the tabix index structure.
00064 
00065     // Read the magic string.
00066     char magic[4];
00067     if(ifread(indexFile, magic, 4) != 4)
00068     {
00069         // Failed to read the magic
00070         return(StatGenStatus::FAIL_IO);
00071     }
00072 
00073     // If this is not an index file, set num references to 0. 
00074     if (magic[0] != 'T' || magic[1] != 'B' || magic[2] != 'I' || magic[3] != 1)
00075     {
00076         // Not a Tabix Index file.
00077         return(StatGenStatus::FAIL_PARSE);
00078     }
00079 
00080     // It is a tabix index file.
00081     // Read the number of reference sequences.
00082     if(ifread(indexFile, &n_ref, 4) != 4)
00083     {
00084         // Failed to read.
00085         return(StatGenStatus::FAIL_IO);
00086     }
00087 
00088     // Size the references.
00089     myRefs.resize(n_ref);
00090 
00091     // Read the Format configuration.
00092     if(ifread(indexFile, &myFormat, sizeof(myFormat)) != sizeof(myFormat))
00093     {
00094         // Failed to read.
00095         return(StatGenStatus::FAIL_IO);
00096     }
00097 
00098     // Read the length of the chromosome names.
00099     uint32_t l_nm;
00100 
00101     if(ifread(indexFile, &l_nm, sizeof(l_nm)) != sizeof(l_nm))
00102     {
00103         // Failed to read.
00104         return(StatGenStatus::FAIL_IO);
00105     }
00106 
00107     // Read the chromosome names.
00108     myChromNamesBuffer = new char[l_nm];
00109     if(ifread(indexFile, myChromNamesBuffer, l_nm) != l_nm)
00110     {
00111         return(StatGenStatus::FAIL_IO);
00112     }
00113     myChromNamesVector.resize(n_ref);
00114 
00115     // Parse out the chromosome names.
00116     bool prevNull = true;
00117     int chromIndex = 0;
00118     for(uint32_t i = 0; i < l_nm; i++)
00119     {
00120         if(chromIndex >= n_ref)
00121         {
00122             // already set the pointer for the last chromosome name, 
00123             // so stop looping.
00124             break;
00125         }
00126         if(prevNull == true)
00127         {
00128             myChromNamesVector[chromIndex++] = myChromNamesBuffer + i;
00129             prevNull = false;
00130         }
00131         if(myChromNamesBuffer[i] == '\0')
00132         {
00133             prevNull = true;
00134         }
00135     }
00136 
00137     for(int refIndex = 0; refIndex < n_ref; refIndex++)
00138     {
00139         // Read each reference.
00140         Reference* ref = &(myRefs[refIndex]);
00141         
00142         // Resize the bins so they can be indexed by bin number.
00143         ref->bins.resize(MAX_NUM_BINS + 1);
00144         
00145         // Read the number of bins.
00146         if(ifread(indexFile, &(ref->n_bin), 4) != 4)
00147         {
00148             // Failed to read the number of bins.
00149             // Return failure.
00150             return(StatGenStatus::FAIL_PARSE);
00151         }
00152 
00153         // Read each bin.
00154         for(int binIndex = 0; binIndex < ref->n_bin; binIndex++)
00155         {
00156             uint32_t binNumber;
00157 
00158             // Read in the bin number.
00159             if(ifread(indexFile, &(binNumber), 4) != 4)
00160             {
00161                 // Failed to read the bin number.
00162                 // Return failure.
00163                 return(StatGenStatus::FAIL_IO);
00164             }
00165 
00166             // Add the bin to the reference and get the
00167             // pointer back so the values can be set in it.
00168             Bin* binPtr = &(ref->bins[binNumber]);
00169             binPtr->bin = binNumber;
00170          
00171             // Read in the number of chunks.
00172             if(ifread(indexFile, &(binPtr->n_chunk), 4) != 4)
00173             {
00174                 // Failed to read number of chunks.
00175                 // Return failure.
00176                 return(StatGenStatus::FAIL_IO);
00177             }
00178 
00179             // Read in the chunks.
00180             // Allocate space for the chunks.
00181             uint32_t sizeOfChunkList = binPtr->n_chunk * sizeof(Chunk);
00182             binPtr->chunks = (Chunk*)malloc(sizeOfChunkList);
00183             if(ifread(indexFile, binPtr->chunks, sizeOfChunkList) != sizeOfChunkList)
00184             {
00185                 // Failed to read the chunks.
00186                 // Return failure.
00187                 return(StatGenStatus::FAIL_IO);
00188             }
00189         }
00190 
00191         // Read the number of intervals.
00192         if(ifread(indexFile, &(ref->n_intv), 4) != 4)
00193         {
00194             // Failed to read, set to 0.
00195             ref->n_intv = 0;
00196             // Return failure.
00197             return(StatGenStatus::FAIL_IO);
00198         }
00199 
00200         // Allocate space for the intervals and read them.
00201         uint32_t linearIndexSize = ref->n_intv * sizeof(uint64_t);
00202         ref->ioffsets = (uint64_t*)malloc(linearIndexSize);
00203         if(ifread(indexFile, ref->ioffsets, linearIndexSize) != linearIndexSize)
00204         {
00205             // Failed to read the linear index.
00206             // Return failure.
00207             return(StatGenStatus::FAIL_IO);
00208         }
00209     }
00210 
00211     // Successfully read teh bam index file.
00212     return(StatGenStatus::SUCCESS);
00213 }
00214 
00215 
00216 bool Tabix::getStartPos(const char* refName, int32_t start,
00217                         uint64_t& fileStartPos)
00218 {
00219     // Look for the reference name in the list.
00220     int refID = 0;
00221     for(refID = 0; refID < n_ref; refID++)
00222     {
00223         if(strcmp(refName, myChromNamesVector[refID]) == 0)
00224         {
00225             // found the reference
00226             break;
00227         }
00228     }
00229     if(refID >= n_ref)
00230     {
00231         // Didn't find the refName, so return false.
00232         return(false);
00233     }
00234 
00235     // Look up in the linear index.
00236     if(start < 0)
00237     {
00238         // Negative index, so start at 0.
00239         start = 0;
00240     }
00241     return(getMinOffsetFromLinearIndex(refID, start, fileStartPos));
00242 }
Generated on Mon Feb 11 13:45:19 2013 for libStatGen Software by  doxygen 1.6.3