libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #include "SamFile.h" 00019 #include "ModifyVar.h" 00020 #include "SamValidation.h" 00021 00022 void modifyFirstBase() 00023 { 00024 SamFile samIn; 00025 // Open the file for reading. 00026 assert(samIn.OpenForRead("testFiles/testVar.bam")); 00027 00028 SamFile samOut; 00029 // Open the file for writing. 00030 assert(samOut.OpenForWrite("results/updateVar.bam")); 00031 00032 // Read the sam header. 00033 SamFileHeader samHeader; 00034 assert(samIn.ReadHeader(samHeader)); 00035 00036 assert(samOut.WriteHeader(samHeader)); 00037 00038 // Read the sam records. 00039 SamRecord samRecord; 00040 00041 // Keep reading records until the end of the file is reached. 00042 while(samIn.ReadRecord(samHeader, samRecord)) 00043 { 00044 // Successfully read a record from the file, so check to see 00045 // if it is valid. 00046 SamValidationErrors samValidationErrors; 00047 assert(SamValidator::isValid(samHeader, samRecord, 00048 samValidationErrors)); 00049 00050 // Get the sequence. 00051 const char* sequence = samRecord.getSequence(); 00052 assert(strcmp(sequence, "") != 0); 00053 std::string upSeq = sequence; 00054 upSeq[0] = 'N'; 00055 assert(samRecord.setSequence(upSeq.c_str())); 00056 00057 // write the sequence. 00058 assert(samOut.WriteRecord(samHeader, samRecord)); 00059 } 00060 00061 // Should have exited only when done reading. 00062 assert(samIn.GetStatus() == SamStatus::NO_MORE_RECS); 00063 } 00064 00065 00066 void modifyFirstBaseLong() 00067 { 00068 SamFile samIn; 00069 // Open the file for reading. 00070 assert(samIn.OpenForRead("statusTests/HalfG.bam")); 00071 00072 SamFile samOut; 00073 // Open the file for writing. 00074 assert(samOut.OpenForWrite("results/updateSeq.bam")); 00075 00076 // Read the sam header. 00077 SamFileHeader samHeader; 00078 assert(samIn.ReadHeader(samHeader)); 00079 00080 assert(samOut.WriteHeader(samHeader)); 00081 00082 // Read the sam records. 00083 SamRecord samRecord; 00084 00085 // Keep reading records until the end of the file is reached. 00086 while(samIn.ReadRecord(samHeader, samRecord)) 00087 { 00088 // Successfully read a record from the file, so check to see 00089 // if it is valid. 00090 SamValidationErrors samValidationErrors; 00091 assert(SamValidator::isValid(samHeader, samRecord, 00092 samValidationErrors)); 00093 00094 // Get the sequence. 00095 const char* sequence = samRecord.getSequence(); 00096 assert(strcmp(sequence, "") != 0); 00097 std::string upSeq = sequence; 00098 upSeq[0] = 'N'; 00099 assert(samRecord.setSequence(upSeq.c_str())); 00100 00101 // write the sequence. 00102 assert(samOut.WriteRecord(samHeader, samRecord)); 00103 } 00104 00105 // Should have exited only when done reading. 00106 assert(samIn.GetStatus() == SamStatus::NO_MORE_RECS); 00107 } 00108 00109 00110 void testModifyVar() 00111 { 00112 #ifdef __ZLIB_AVAILABLE__ 00113 modifyFirstBase(); 00114 #endif 00115 modifyVar modTest; 00116 modTest.testModifyVar("testFiles/testSam.sam", true); 00117 modTest.testModifyVar("testFiles/testSam.sam", false); 00118 #ifdef __ZLIB_AVAILABLE__ 00119 modTest.testModifyVar("testFiles/testBam.bam", true); 00120 modTest.testModifyVar("testFiles/testBam.bam", false); 00121 #endif 00122 } 00123 00124 00125 void modifyVar::testModifyVar(const char* filename, bool valBufFirst) 00126 { 00127 myFilename = filename; 00128 myValBufFirst = valBufFirst; 00129 00130 testModifyReadNameOnlySameLength(); 00131 testModifyCigarOnlySameLength(); 00132 testModifySequenceOnlySameLength(); 00133 testModifyQualityOnlySameLength(); 00134 testRemoveQuality(); 00135 testShortenQuality(); 00136 testLengthenQuality(); 00137 00138 testShortenReadName(); 00139 testShortenCigar(); 00140 testShortenSequence(); 00141 00142 testLengthenReadName(); 00143 testLengthenCigar(); 00144 testLengthenSequence(); 00145 00146 testRemoveCigar(); 00147 testRemoveSequence(); 00148 testLengthenSequenceAndQuality(); 00149 } 00150 00151 00152 void modifyVar::testModifyReadNameOnlySameLength() 00153 { 00154 resetExpected(); 00155 openAndRead1Rec(); 00156 00157 // Set the Read Name - same length, just different name. 00158 expectedReadNameString = "1:1011:G:255+17M15D20M"; 00159 samRecord.setReadName(expectedReadNameString.c_str()); 00160 00161 validate(); 00162 } 00163 00164 00165 void modifyVar::testModifyCigarOnlySameLength() 00166 { 00167 resetExpected(); 00168 openAndRead1Rec(); 00169 00170 // Set the Cigar - same length, just different values. 00171 expectedCigarString = "3M2I"; 00172 samRecord.setCigar(expectedCigarString.c_str()); 00173 00174 // The new Cigar for record 1 is 3M2I 00175 // 3M = 3 << 4 | 0 = 0x30 00176 // 2I = 2 << 4 | 1 = 0x21 00177 expectedCigarBufLen = 2; 00178 expectedCigarBuffer[0] = 0x30; 00179 expectedCigarBuffer[1] = 0x21; 00180 00181 validate(); 00182 } 00183 00184 00185 void modifyVar::testModifySequenceOnlySameLength() 00186 { 00187 resetExpected(); 00188 openAndRead1Rec(); 00189 00190 // Set the Sequence - same length, just different values. 00191 expectedSequenceString = "NCGAN"; 00192 samRecord.setSequence(expectedSequenceString.c_str()); 00193 00194 // NCGAN = NC GA N = 0xF2 0x41 0xF0 00195 expectedSequenceBuffer[0] = 0xF2; 00196 expectedSequenceBuffer[1] = 0x41; 00197 expectedSequenceBuffer[2] = 0xF0; 00198 00199 validate(); 00200 } 00201 00202 00203 void modifyVar::testModifyQualityOnlySameLength() 00204 { 00205 resetExpected(); 00206 openAndRead1Rec(); 00207 00208 // Set the Quality - same length, just different values. 00209 expectedQualityString = "!>6+!"; 00210 samRecord.setQuality(expectedQualityString.c_str()); 00211 00212 validate(); 00213 } 00214 00215 00216 void modifyVar::testRemoveQuality() 00217 { 00218 resetExpected(); 00219 openAndRead1Rec(); 00220 00221 // Set the Quality - to "*" - does not affect the length since the 00222 // sequence field drives the length. 00223 expectedQualityString = "*"; 00224 samRecord.setQuality(expectedQualityString.c_str()); 00225 00226 validate(); 00227 } 00228 00229 00230 void modifyVar::testShortenQuality() 00231 { 00232 resetExpected(); 00233 openAndRead1Rec(); 00234 00235 // Set the Quality - shorten, but doesn't affect the length since 00236 // the sequence field drives the length. 00237 expectedQualityString = "!!"; 00238 samRecord.setQuality(expectedQualityString.c_str()); 00239 00240 validate(); 00241 } 00242 00243 00244 void modifyVar::testLengthenQuality() 00245 { 00246 resetExpected(); 00247 openAndRead1Rec(); 00248 00249 // Set the Quality - lengthen, but doesn't affect the length since 00250 // the sequence field drives the length. 00251 expectedQualityString = "!!@@##"; 00252 samRecord.setQuality(expectedQualityString.c_str()); 00253 00254 validate(); 00255 } 00256 00257 00258 void modifyVar::testShortenReadName() 00259 { 00260 resetExpected(); 00261 openAndRead1Rec(); 00262 00263 // Set the Read Name - shorter length 00264 expectedReadNameString = "1:1011:G:255"; 00265 samRecord.setReadName(expectedReadNameString.c_str()); 00266 00267 validate(); 00268 } 00269 00270 00271 void modifyVar::testShortenCigar() 00272 { 00273 resetExpected(); 00274 openAndRead1Rec(); 00275 00276 // Set the Cigar - shorter length 00277 expectedCigarString = "5M"; 00278 samRecord.setCigar(expectedCigarString.c_str()); 00279 00280 // The new Cigar for record 1 is 5M 00281 // 5M = 5 << 4 | 0 = 0x50 00282 expectedCigarBufLen = 1; 00283 expectedCigarBuffer[0] = 0x50; 00284 00285 validate(); 00286 } 00287 00288 00289 void modifyVar::testShortenSequence() 00290 { 00291 resetExpected(); 00292 openAndRead1Rec(); 00293 00294 // Set the Sequence - shorter length. 00295 expectedSequenceString = "CCGA"; 00296 samRecord.setSequence(expectedSequenceString.c_str()); 00297 00298 // CCGA = CC GA = 0x22 0x41 00299 expectedSequenceBuffer[0] = 0x22; 00300 expectedSequenceBuffer[1] = 0x41; 00301 00302 validate(); 00303 } 00304 00305 00306 void modifyVar::testLengthenReadName() 00307 { 00308 resetExpected(); 00309 openAndRead1Rec(); 00310 00311 // Set the Read Name - longer. 00312 expectedReadNameString = "1:1011:G:255+17M15D20M:1111111"; 00313 samRecord.setReadName(expectedReadNameString.c_str()); 00314 00315 validate(); 00316 } 00317 00318 00319 void modifyVar::testLengthenCigar() 00320 { 00321 resetExpected(); 00322 openAndRead1Rec(); 00323 00324 // Set the Cigar - longer length. 00325 expectedCigarString = "3M2D2I"; 00326 samRecord.setCigar(expectedCigarString.c_str()); 00327 00328 // The new Cigar for record 1 is 3M2I 00329 // 3M = 3 << 4 | 0 = 0x30 00330 // 2D = 2 << 2 | 1 = 0x22 00331 // 2I = 2 << 4 | 1 = 0x21 00332 expectedCigarBufLen = 3; 00333 expectedCigarBuffer[0] = 0x30; 00334 expectedCigarBuffer[1] = 0x22; 00335 expectedCigarBuffer[2] = 0x21; 00336 00337 validate(); 00338 } 00339 00340 00341 void modifyVar::testLengthenSequence() 00342 { 00343 resetExpected(); 00344 openAndRead1Rec(); 00345 00346 // Set the Sequence - longer length. 00347 expectedSequenceString = "CCGAATT"; 00348 samRecord.setSequence(expectedSequenceString.c_str()); 00349 00350 // CCGAATT = CC GA AT T = 0x22 0x41 0x18 0x80 00351 expectedSequenceBuffer[0] = 0x22; 00352 expectedSequenceBuffer[1] = 0x41; 00353 expectedSequenceBuffer[2] = 0x18; 00354 expectedSequenceBuffer[3] = 0x80; 00355 00356 validate(); 00357 } 00358 00359 00360 void modifyVar::testRemoveCigar() 00361 { 00362 resetExpected(); 00363 openAndRead1Rec(); 00364 00365 // Set the Cigar - same length, just different values. 00366 expectedCigarString = "*"; 00367 expectedCigarBufLen = 0; 00368 samRecord.setCigar(expectedCigarString.c_str()); 00369 00370 validate(); 00371 } 00372 00373 00374 void modifyVar::testRemoveSequence() 00375 { 00376 resetExpected(); 00377 openAndRead1Rec(); 00378 00379 // Set the Sequence - shorter length. 00380 expectedSequenceString = "*"; 00381 samRecord.setSequence(expectedSequenceString.c_str()); 00382 00383 validate(); 00384 } 00385 00386 00387 void modifyVar::testLengthenSequenceAndQuality() 00388 { 00389 resetExpected(); 00390 openAndRead1Rec(); 00391 00392 // Set the Sequence & quality - longer. 00393 expectedSequenceString = "CCGAATT"; 00394 expectedQualityString = "!@#$%^&"; 00395 samRecord.setSequence(expectedSequenceString.c_str()); 00396 samRecord.setQuality(expectedQualityString.c_str()); 00397 00398 // CCGAATT = CC GA AT T = 0x22 0x41 0x18 0x80 00399 expectedSequenceBuffer[0] = 0x22; 00400 expectedSequenceBuffer[1] = 0x41; 00401 expectedSequenceBuffer[2] = 0x18; 00402 expectedSequenceBuffer[3] = 0x80; 00403 00404 validate(); 00405 } 00406 00407 00408 void modifyVar::validate() 00409 { 00410 if(myValBufFirst) 00411 { 00412 // get the record. 00413 const bamRecordStruct* recordBuffer = 00414 (const bamRecordStruct*)samRecord.getRecordBuffer(); 00415 00416 // Validate the buffer. 00417 validateReadName(recordBuffer); 00418 validateCigar(recordBuffer); 00419 validateSequence(recordBuffer); 00420 validateQuality(recordBuffer); 00421 validateTags(recordBuffer); 00422 00423 // Validate the strings. 00424 validateReadNameString(); 00425 validateCigarString(); 00426 validateSequenceString(); 00427 validateQualityString(); 00428 validateTagsString(); 00429 } 00430 else 00431 { 00432 // get the record. 00433 const bamRecordStruct* recordBuffer = 00434 (const bamRecordStruct*)samRecord.getRecordBuffer(); 00435 00436 // Validate the buffer. 00437 validateReadName(recordBuffer); 00438 validateCigar(recordBuffer); 00439 validateSequence(recordBuffer); 00440 validateQuality(recordBuffer); 00441 validateTags(recordBuffer); 00442 00443 // Validate the strings. 00444 validateReadNameString(); 00445 validateCigarString(); 00446 validateSequenceString(); 00447 validateQualityString(); 00448 validateTagsString(); 00449 } 00450 } 00451 00452 void modifyVar::validateReadName(const bamRecordStruct* recordBuffer) 00453 { 00454 const char* varPtr = (const char*)&(recordBuffer->myData); 00455 00456 unsigned int len = expectedReadNameString.length(); 00457 for(unsigned int i = 0; i < len; i++) 00458 { 00459 assert(varPtr[i] == expectedReadNameString[i]); 00460 } 00461 00462 // Verify ending null. 00463 assert(varPtr[len] == 0); 00464 00465 // verify the length - add one for the terminating null. 00466 assert(recordBuffer->myReadNameLength == 00467 expectedReadNameString.length() + 1); 00468 } 00469 00470 00471 void modifyVar::validateCigar(const bamRecordStruct* recordBuffer) 00472 { 00473 const unsigned char* cigarStart = 00474 (const unsigned char*)&(recordBuffer->myData) 00475 + recordBuffer->myReadNameLength; 00476 00477 unsigned int* varPtr = (unsigned int*)cigarStart; 00478 00479 for(int i = 0; i < expectedCigarBufLen; i++) 00480 { 00481 assert(varPtr[i] == expectedCigarBuffer[i]); 00482 } 00483 00484 assert(recordBuffer->myCigarLength == expectedCigarBufLen); 00485 } 00486 00487 00488 void modifyVar::validateSequence(const bamRecordStruct* recordBuffer) 00489 { 00490 // Calculate the sequence length. 00491 int expectedReadLen = expectedSequenceString.length(); 00492 int seqLen = (expectedReadLen + 1)/2; 00493 if(expectedSequenceString == "*") 00494 { 00495 expectedReadLen = 0; 00496 seqLen = 0; 00497 } 00498 00499 const unsigned char* sequencePtr = 00500 (const unsigned char*)&(recordBuffer->myData) 00501 + recordBuffer->myReadNameLength + (recordBuffer->myCigarLength * 4); 00502 00503 for(int i = 0; i < seqLen; i++) 00504 { 00505 assert(sequencePtr[i] == expectedSequenceBuffer[i]); 00506 } 00507 00508 assert(recordBuffer->myReadLength == expectedReadLen); 00509 } 00510 00511 00512 void modifyVar::validateQuality(const bamRecordStruct* recordBuffer) 00513 { 00514 int expectedReadLen = expectedSequenceString.length(); 00515 int seqLen = (expectedReadLen + 1)/2; 00516 if(expectedSequenceString == "*") 00517 { 00518 expectedReadLen = 0; 00519 seqLen = 0; 00520 } 00521 00522 const uint8_t* qualityPtr = 00523 (const unsigned char*)&(recordBuffer->myData) 00524 + recordBuffer->myReadNameLength + (recordBuffer->myCigarLength * 4) 00525 + seqLen; 00526 00527 int qualityLen = expectedQualityString.length(); 00528 00529 for(int i = 0; i < expectedReadLen; i++) 00530 { 00531 if(expectedQualityString == "*") 00532 { 00533 // no quality, so check for 0xFF. 00534 assert(qualityPtr[i] == 0xFF); 00535 } 00536 else if(i >= qualityLen) 00537 { 00538 // Quality is shorter than the sequence, so should be padded with 00539 // 0xFF. 00540 assert(qualityPtr[i] == 0xFF); 00541 } 00542 else 00543 { 00544 assert(qualityPtr[i] == (expectedQualityString[i] - 33)); 00545 } 00546 } 00547 00548 assert(recordBuffer->myReadLength == expectedReadLen); 00549 } 00550 00551 00552 void modifyVar::validateTags(const bamRecordStruct* recordBuffer) 00553 { 00554 const unsigned char* tagsPtr = 00555 (const unsigned char*)&(recordBuffer->myData) 00556 + recordBuffer->myReadNameLength + (recordBuffer->myCigarLength * 4) 00557 + (recordBuffer->myReadLength + 1)/2 + recordBuffer->myReadLength; 00558 00559 for(int i = 0; i < expectedTagsLen; i++) 00560 { 00561 assert(tagsPtr[i] == expectedTagsBuffer[i]); 00562 } 00563 00564 // Calculate expected block size - from the start of the buffer to the 00565 // start of the tags plus the tags length - minus the size of the blocksize 00566 // field. 00567 int32_t expectedBlockSize = tagsPtr - (const unsigned char*)(recordBuffer) 00568 + expectedTagsLen - 4; 00569 assert(recordBuffer->myBlockSize == expectedBlockSize); 00570 } 00571 00572 00573 void modifyVar::validateTagsString() 00574 { 00575 char tag[3]; 00576 char type; 00577 void* value; 00578 assert(samRecord.getNextSamTag(tag, type, &value) == true); 00579 assert(tag[0] == 'A'); 00580 assert(tag[1] == 'M'); 00581 assert(type == 'i'); 00582 assert(*(char*)value == 0); 00583 assert(samRecord.getNextSamTag(tag, type, &value) == true); 00584 assert(tag[0] == 'M'); 00585 assert(tag[1] == 'D'); 00586 assert(type == 'Z'); 00587 assert(*(String*)value == "37"); 00588 assert(samRecord.getNextSamTag(tag, type, &value) == true); 00589 assert(tag[0] == 'N'); 00590 assert(tag[1] == 'M'); 00591 assert(type == 'i'); 00592 assert(*(char*)value == 0); 00593 assert(samRecord.getNextSamTag(tag, type, &value) == true); 00594 assert(tag[0] == 'X'); 00595 assert(tag[1] == 'T'); 00596 assert(type == 'A'); 00597 assert(*(char*)value == 'R'); 00598 // No more tags, should return false. 00599 assert(samRecord.getNextSamTag(tag, type, &value) == false); 00600 assert(samRecord.getNextSamTag(tag, type, &value) == false); 00601 } 00602 00603 void modifyVar::validateReadNameString() 00604 { 00605 assert(samRecord.getReadName() == expectedReadNameString); 00606 } 00607 00608 void modifyVar::validateCigarString() 00609 { 00610 assert(samRecord.getCigar() == expectedCigarString); 00611 } 00612 00613 void modifyVar::validateSequenceString() 00614 { 00615 assert(samRecord.getSequence() == expectedSequenceString); 00616 } 00617 00618 void modifyVar::validateQualityString() 00619 { 00620 assert(samRecord.getQuality() == expectedQualityString); 00621 } 00622 00623 00624 void modifyVar::resetExpected() 00625 { 00626 expectedReadNameString = "1:1011:F:255+17M15D20M"; 00627 expectedCigarString = "5M2D"; 00628 expectedSequenceString = "CCGAA"; 00629 expectedQualityString = "6>6+4"; 00630 00631 // The default Cigar for record 1 is 5M2D 00632 // 5M = 5 << 4 | 0 = 0x50 00633 // 2D = 2 << 4 | 2 = 0x22 00634 expectedCigarBufLen = 2; 00635 expectedCigarBuffer[0] = 0x50; 00636 expectedCigarBuffer[1] = 0x22; 00637 00638 // CCGAA = CC GA A = 0x22 0x41 0x10 00639 expectedSequenceBuffer[0] = 0x22; 00640 expectedSequenceBuffer[1] = 0x41; 00641 expectedSequenceBuffer[2] = 0x10; 00642 00643 expectedTagsLen = 18; 00644 expectedTagsBuffer[0] = 'A'; 00645 expectedTagsBuffer[1] = 'M'; 00646 expectedTagsBuffer[2] = 'C'; 00647 expectedTagsBuffer[3] = 0; 00648 expectedTagsBuffer[4] = 'M'; 00649 expectedTagsBuffer[5] = 'D'; 00650 expectedTagsBuffer[6] = 'Z'; 00651 expectedTagsBuffer[7] = '3'; 00652 expectedTagsBuffer[8] = '7'; 00653 expectedTagsBuffer[9] = 0; 00654 expectedTagsBuffer[10] = 'N'; 00655 expectedTagsBuffer[11] = 'M'; 00656 expectedTagsBuffer[12] = 'C'; 00657 expectedTagsBuffer[13] = 0; 00658 expectedTagsBuffer[14] = 'X'; 00659 expectedTagsBuffer[15] = 'T'; 00660 expectedTagsBuffer[16] = 'A'; 00661 expectedTagsBuffer[17] = 'R'; 00662 } 00663 00664 00665 void modifyVar::openAndRead1Rec() 00666 { 00667 // Open the file for reading. 00668 assert(samIn.OpenForRead(myFilename)); 00669 00670 // Read the sam header. 00671 assert(samIn.ReadHeader(samHeader)); 00672 00673 // Read the first record. 00674 assert(samIn.ReadRecord(samHeader, samRecord)); 00675 }