libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2012-2013 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 #include "Tabix.h" 00018 #include <stdexcept> 00019 #include "StringBasics.h" 00020 00021 Tabix::Tabix() 00022 : IndexBase(), 00023 myChromNamesBuffer(NULL) 00024 { 00025 } 00026 00027 00028 Tabix::~Tabix() 00029 { 00030 if(myChromNamesBuffer != NULL) 00031 { 00032 delete[] myChromNamesBuffer; 00033 myChromNamesBuffer = NULL; 00034 } 00035 } 00036 00037 00038 // Reset the member data for a new index file. 00039 void Tabix::resetIndex() 00040 { 00041 IndexBase::resetIndex(); 00042 if(myChromNamesBuffer != NULL) 00043 { 00044 delete[] myChromNamesBuffer; 00045 myChromNamesBuffer = NULL; 00046 } 00047 myChromNamesVector.clear(); 00048 } 00049 00050 00051 // Read & parse the specified index file. 00052 StatGenStatus::Status Tabix::readIndex(const char* filename) 00053 { 00054 // Reset the index from anything that may previously be set. 00055 resetIndex(); 00056 00057 IFILE indexFile = ifopen(filename, "rb"); 00058 00059 // Failed to open the index file. 00060 if(indexFile == NULL) 00061 { 00062 return(StatGenStatus::FAIL_IO); 00063 } 00064 00065 // read the tabix index structure. 00066 00067 // Read the magic string. 00068 char magic[4]; 00069 if(ifread(indexFile, magic, 4) != 4) 00070 { 00071 // Failed to read the magic 00072 return(StatGenStatus::FAIL_IO); 00073 } 00074 00075 // If this is not an index file, set num references to 0. 00076 if (magic[0] != 'T' || magic[1] != 'B' || magic[2] != 'I' || magic[3] != 1) 00077 { 00078 // Not a Tabix Index file. 00079 return(StatGenStatus::FAIL_PARSE); 00080 } 00081 00082 // It is a tabix index file. 00083 // Read the number of reference sequences. 00084 if(ifread(indexFile, &n_ref, 4) != 4) 00085 { 00086 // Failed to read. 00087 return(StatGenStatus::FAIL_IO); 00088 } 00089 00090 // Size the references. 00091 myRefs.resize(n_ref); 00092 00093 // Read the Format configuration. 00094 if(ifread(indexFile, &myFormat, sizeof(myFormat)) != sizeof(myFormat)) 00095 { 00096 // Failed to read. 00097 return(StatGenStatus::FAIL_IO); 00098 } 00099 00100 // Read the length of the chromosome names. 00101 uint32_t l_nm; 00102 00103 if(ifread(indexFile, &l_nm, sizeof(l_nm)) != sizeof(l_nm)) 00104 { 00105 // Failed to read. 00106 return(StatGenStatus::FAIL_IO); 00107 } 00108 00109 // Read the chromosome names. 00110 myChromNamesBuffer = new char[l_nm]; 00111 if(ifread(indexFile, myChromNamesBuffer, l_nm) != l_nm) 00112 { 00113 return(StatGenStatus::FAIL_IO); 00114 } 00115 myChromNamesVector.resize(n_ref); 00116 00117 // Parse out the chromosome names. 00118 bool prevNull = true; 00119 int chromIndex = 0; 00120 for(uint32_t i = 0; i < l_nm; i++) 00121 { 00122 if(chromIndex >= n_ref) 00123 { 00124 // already set the pointer for the last chromosome name, 00125 // so stop looping. 00126 break; 00127 } 00128 if(prevNull == true) 00129 { 00130 myChromNamesVector[chromIndex++] = myChromNamesBuffer + i; 00131 prevNull = false; 00132 } 00133 if(myChromNamesBuffer[i] == '\0') 00134 { 00135 prevNull = true; 00136 } 00137 } 00138 00139 for(int refIndex = 0; refIndex < n_ref; refIndex++) 00140 { 00141 // Read each reference. 00142 Reference* ref = &(myRefs[refIndex]); 00143 00144 // Resize the bins so they can be indexed by bin number. 00145 ref->bins.resize(MAX_NUM_BINS + 1); 00146 00147 // Read the number of bins. 00148 if(ifread(indexFile, &(ref->n_bin), 4) != 4) 00149 { 00150 // Failed to read the number of bins. 00151 // Return failure. 00152 return(StatGenStatus::FAIL_PARSE); 00153 } 00154 00155 // Read each bin. 00156 for(int binIndex = 0; binIndex < ref->n_bin; binIndex++) 00157 { 00158 uint32_t binNumber; 00159 00160 // Read in the bin number. 00161 if(ifread(indexFile, &(binNumber), 4) != 4) 00162 { 00163 // Failed to read the bin number. 00164 // Return failure. 00165 return(StatGenStatus::FAIL_IO); 00166 } 00167 00168 // Add the bin to the reference and get the 00169 // pointer back so the values can be set in it. 00170 Bin* binPtr = &(ref->bins[binNumber]); 00171 binPtr->bin = binNumber; 00172 00173 // Read in the number of chunks. 00174 if(ifread(indexFile, &(binPtr->n_chunk), 4) != 4) 00175 { 00176 // Failed to read number of chunks. 00177 // Return failure. 00178 return(StatGenStatus::FAIL_IO); 00179 } 00180 00181 // Read in the chunks. 00182 // Allocate space for the chunks. 00183 uint32_t sizeOfChunkList = binPtr->n_chunk * sizeof(Chunk); 00184 binPtr->chunks = (Chunk*)malloc(sizeOfChunkList); 00185 if(ifread(indexFile, binPtr->chunks, sizeOfChunkList) != sizeOfChunkList) 00186 { 00187 // Failed to read the chunks. 00188 // Return failure. 00189 return(StatGenStatus::FAIL_IO); 00190 } 00191 } 00192 00193 // Read the number of intervals. 00194 if(ifread(indexFile, &(ref->n_intv), 4) != 4) 00195 { 00196 // Failed to read, set to 0. 00197 ref->n_intv = 0; 00198 // Return failure. 00199 return(StatGenStatus::FAIL_IO); 00200 } 00201 00202 // Allocate space for the intervals and read them. 00203 uint32_t linearIndexSize = ref->n_intv * sizeof(uint64_t); 00204 ref->ioffsets = (uint64_t*)malloc(linearIndexSize); 00205 if(ifread(indexFile, ref->ioffsets, linearIndexSize) != linearIndexSize) 00206 { 00207 // Failed to read the linear index. 00208 // Return failure. 00209 return(StatGenStatus::FAIL_IO); 00210 } 00211 } 00212 00213 // Successfully read teh bam index file. 00214 return(StatGenStatus::SUCCESS); 00215 } 00216 00217 00218 bool Tabix::getStartPos(const char* refName, int32_t start, 00219 uint64_t& fileStartPos) const 00220 { 00221 // Look for the reference name in the list. 00222 int refID = 0; 00223 for(refID = 0; refID < n_ref; refID++) 00224 { 00225 if(strcmp(refName, myChromNamesVector[refID]) == 0) 00226 { 00227 // found the reference 00228 break; 00229 } 00230 } 00231 if(refID >= n_ref) 00232 { 00233 // Didn't find the refName, so return false. 00234 return(false); 00235 } 00236 00237 // Look up in the linear index. 00238 if(start < 0) 00239 { 00240 // Negative index, so start at 0. 00241 start = 0; 00242 } 00243 return(getMinOffsetFromLinearIndex(refID, start, fileStartPos)); 00244 } 00245 00246 00247 const char* Tabix::getRefName(unsigned int indexNum) const 00248 { 00249 if(indexNum >= myChromNamesVector.size()) 00250 { 00251 String message = "ERROR: Out of range on Tabix::getRefName("; 00252 message += indexNum; 00253 message += ")"; 00254 throw(std::runtime_error(message.c_str())); 00255 return(NULL); 00256 } 00257 return(myChromNamesVector[indexNum]); 00258 }