ModifyVar.cpp

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include "SamFile.h"
00019 #include "ModifyVar.h"
00020 #include "SamValidation.h"
00021 
00022 void modifyFirstBase()
00023 {
00024     SamFile samIn;
00025     // Open the file for reading.   
00026     assert(samIn.OpenForRead("testFiles/testVar.bam"));
00027 
00028     SamFile samOut;
00029     // Open the file for writing.   
00030     assert(samOut.OpenForWrite("results/updateVar.bam"));
00031 
00032     // Read the sam header.
00033     SamFileHeader samHeader;
00034     assert(samIn.ReadHeader(samHeader));
00035 
00036     assert(samOut.WriteHeader(samHeader));
00037 
00038     // Read the sam records.
00039     SamRecord samRecord;
00040 
00041     // Keep reading records until the end of the file is reached.
00042     while(samIn.ReadRecord(samHeader, samRecord))
00043     {
00044         // Successfully read a record from the file, so check to see
00045         // if it is valid.
00046         SamValidationErrors samValidationErrors;
00047         assert(SamValidator::isValid(samHeader, samRecord, 
00048                                      samValidationErrors));
00049       
00050         // Get the sequence.
00051         const char* sequence = samRecord.getSequence();
00052         assert(strcmp(sequence, "") != 0);
00053         std::string upSeq = sequence;
00054         upSeq[0] = 'N';
00055         assert(samRecord.setSequence(upSeq.c_str()));
00056       
00057         // write the sequence.
00058         assert(samOut.WriteRecord(samHeader, samRecord));
00059     }
00060 
00061     // Should have exited only when done reading.
00062     assert(samIn.GetStatus() == SamStatus::NO_MORE_RECS);
00063 }
00064 
00065 
00066 void modifyFirstBaseLong()
00067 {
00068     SamFile samIn;
00069     // Open the file for reading.
00070     assert(samIn.OpenForRead("statusTests/HalfG.bam"));
00071 
00072     SamFile samOut;
00073     // Open the file for writing.   
00074     assert(samOut.OpenForWrite("results/updateSeq.bam"));
00075 
00076     // Read the sam header.
00077     SamFileHeader samHeader;
00078     assert(samIn.ReadHeader(samHeader));
00079 
00080     assert(samOut.WriteHeader(samHeader));
00081 
00082     // Read the sam records.
00083     SamRecord samRecord;
00084 
00085     // Keep reading records until the end of the file is reached.
00086     while(samIn.ReadRecord(samHeader, samRecord))
00087     {
00088         // Successfully read a record from the file, so check to see
00089         // if it is valid.
00090         SamValidationErrors samValidationErrors;
00091         assert(SamValidator::isValid(samHeader, samRecord, 
00092                                      samValidationErrors));
00093       
00094         // Get the sequence.
00095         const char* sequence = samRecord.getSequence();
00096         assert(strcmp(sequence, "") != 0);
00097         std::string upSeq = sequence;
00098         upSeq[0] = 'N';
00099         assert(samRecord.setSequence(upSeq.c_str()));
00100       
00101         // write the sequence.
00102         assert(samOut.WriteRecord(samHeader, samRecord));
00103     }
00104 
00105     // Should have exited only when done reading.
00106     assert(samIn.GetStatus() == SamStatus::NO_MORE_RECS);
00107 }
00108 
00109 
00110 void testModifyVar()
00111 {
00112     modifyFirstBase();
00113     modifyVar modTest;
00114     modTest.testModifyVar("testFiles/testSam.sam", true);
00115     modTest.testModifyVar("testFiles/testBam.bam", true);
00116     modTest.testModifyVar("testFiles/testSam.sam", false);
00117     modTest.testModifyVar("testFiles/testBam.bam", false);
00118 }
00119 
00120 
00121 void modifyVar::testModifyVar(const char* filename, bool valBufFirst)
00122 {
00123     myFilename = filename;
00124     myValBufFirst = valBufFirst;
00125 
00126     testModifyReadNameOnlySameLength();
00127     testModifyCigarOnlySameLength();
00128     testModifySequenceOnlySameLength();
00129     testModifyQualityOnlySameLength();
00130     testRemoveQuality();
00131     testShortenQuality();
00132     testLengthenQuality();
00133 
00134     testShortenReadName();
00135     testShortenCigar();
00136     testShortenSequence();
00137 
00138     testLengthenReadName();
00139     testLengthenCigar();
00140     testLengthenSequence();
00141    
00142     testRemoveCigar();
00143     testRemoveSequence();
00144     testLengthenSequenceAndQuality();
00145 }
00146 
00147 
00148 void modifyVar::testModifyReadNameOnlySameLength()
00149 {
00150     resetExpected();
00151     openAndRead1Rec();
00152 
00153     // Set the Read Name - same length, just different name.
00154     expectedReadNameString = "1:1011:G:255+17M15D20M";
00155     samRecord.setReadName(expectedReadNameString.c_str());
00156 
00157     validate();
00158 }
00159 
00160 
00161 void modifyVar::testModifyCigarOnlySameLength()
00162 {
00163     resetExpected();
00164     openAndRead1Rec();
00165 
00166     // Set the Cigar - same length, just different values.
00167     expectedCigarString = "3M2I";
00168     samRecord.setCigar(expectedCigarString.c_str());
00169 
00170     // The new Cigar for record 1 is 3M2I
00171     // 3M = 3 << 4 | 0 = 0x30
00172     // 2I = 2 << 4 | 1 = 0x21
00173     expectedCigarBufLen = 2;
00174     expectedCigarBuffer[0] = 0x30;
00175     expectedCigarBuffer[1] = 0x21;
00176    
00177     validate();
00178 }
00179 
00180 
00181 void modifyVar::testModifySequenceOnlySameLength()
00182 {
00183     resetExpected();
00184     openAndRead1Rec();
00185 
00186     // Set the Sequence - same length, just different values.
00187     expectedSequenceString = "NCGAN";
00188     samRecord.setSequence(expectedSequenceString.c_str());
00189 
00190     // NCGAN = NC GA N = 0xF2 0x41 0xF0
00191     expectedSequenceBuffer[0] = 0xF2;
00192     expectedSequenceBuffer[1] = 0x41;
00193     expectedSequenceBuffer[2] = 0xF0;
00194 
00195     validate();
00196 }
00197 
00198 
00199 void modifyVar::testModifyQualityOnlySameLength()
00200 {
00201     resetExpected();
00202     openAndRead1Rec();
00203    
00204     // Set the Quality - same length, just different values.
00205     expectedQualityString = "!>6+!";
00206     samRecord.setQuality(expectedQualityString.c_str());
00207 
00208     validate();
00209 }
00210 
00211 
00212 void modifyVar::testRemoveQuality()
00213 {
00214     resetExpected();
00215     openAndRead1Rec();
00216    
00217     // Set the Quality - to "*" - does not affect the length since the 
00218     // sequence field drives the length.
00219     expectedQualityString = "*";
00220     samRecord.setQuality(expectedQualityString.c_str());
00221 
00222     validate();
00223 }
00224 
00225 
00226 void modifyVar::testShortenQuality()
00227 {
00228     resetExpected();
00229     openAndRead1Rec();
00230 
00231     // Set the Quality - shorten, but doesn't affect the length since
00232     // the sequence field drives the length.
00233     expectedQualityString = "!!";
00234     samRecord.setQuality(expectedQualityString.c_str());
00235 
00236     validate();
00237 }
00238 
00239 
00240 void modifyVar::testLengthenQuality()
00241 {
00242     resetExpected();
00243     openAndRead1Rec();
00244 
00245     // Set the Quality - lengthen, but doesn't affect the length since
00246     // the sequence field drives the length.
00247     expectedQualityString = "!!@@##";
00248     samRecord.setQuality(expectedQualityString.c_str());
00249 
00250     validate();
00251 }
00252 
00253 
00254 void modifyVar::testShortenReadName()
00255 {
00256     resetExpected();
00257     openAndRead1Rec();
00258 
00259     // Set the Read Name - shorter length
00260     expectedReadNameString = "1:1011:G:255";
00261     samRecord.setReadName(expectedReadNameString.c_str());
00262 
00263     validate();
00264 }
00265 
00266 
00267 void modifyVar::testShortenCigar()
00268 {
00269     resetExpected();
00270     openAndRead1Rec();
00271 
00272     // Set the Cigar - shorter length
00273     expectedCigarString = "5M";
00274     samRecord.setCigar(expectedCigarString.c_str());
00275 
00276     // The new Cigar for record 1 is 5M
00277     // 5M = 5 << 4 | 0 = 0x50
00278     expectedCigarBufLen = 1;
00279     expectedCigarBuffer[0] = 0x50;
00280    
00281     validate();
00282 }
00283 
00284 
00285 void modifyVar::testShortenSequence()
00286 {
00287     resetExpected();
00288     openAndRead1Rec();
00289 
00290     // Set the Sequence - shorter length.
00291     expectedSequenceString = "CCGA";
00292     samRecord.setSequence(expectedSequenceString.c_str());
00293 
00294     // CCGA = CC GA = 0x22 0x41
00295     expectedSequenceBuffer[0] = 0x22;
00296     expectedSequenceBuffer[1] = 0x41;
00297 
00298     validate();
00299 }
00300 
00301 
00302 void modifyVar::testLengthenReadName()
00303 {
00304     resetExpected();
00305     openAndRead1Rec();
00306 
00307     // Set the Read Name - longer.
00308     expectedReadNameString = "1:1011:G:255+17M15D20M:1111111";
00309     samRecord.setReadName(expectedReadNameString.c_str());
00310 
00311     validate();
00312 }
00313 
00314 
00315 void modifyVar::testLengthenCigar()
00316 {
00317     resetExpected();
00318     openAndRead1Rec();
00319 
00320     // Set the Cigar - longer length.
00321     expectedCigarString = "3M2D2I";
00322     samRecord.setCigar(expectedCigarString.c_str());
00323 
00324     // The new Cigar for record 1 is 3M2I
00325     // 3M = 3 << 4 | 0 = 0x30
00326     // 2D = 2 << 2 | 1 = 0x22
00327     // 2I = 2 << 4 | 1 = 0x21
00328     expectedCigarBufLen = 3;
00329     expectedCigarBuffer[0] = 0x30;
00330     expectedCigarBuffer[1] = 0x22;
00331     expectedCigarBuffer[2] = 0x21;
00332    
00333     validate();
00334 }
00335 
00336 
00337 void modifyVar::testLengthenSequence()
00338 {
00339     resetExpected();
00340     openAndRead1Rec();
00341 
00342     // Set the Sequence - longer length.
00343     expectedSequenceString = "CCGAATT";
00344     samRecord.setSequence(expectedSequenceString.c_str());
00345 
00346     // CCGAATT = CC GA AT T = 0x22 0x41 0x18 0x80
00347     expectedSequenceBuffer[0] = 0x22;
00348     expectedSequenceBuffer[1] = 0x41;
00349     expectedSequenceBuffer[2] = 0x18;
00350     expectedSequenceBuffer[3] = 0x80;
00351 
00352     validate();
00353 }
00354 
00355 
00356 void modifyVar::testRemoveCigar()
00357 {
00358     resetExpected();
00359     openAndRead1Rec();
00360 
00361     // Set the Cigar - same length, just different values.
00362     expectedCigarString = "*";
00363     expectedCigarBufLen = 0;
00364     samRecord.setCigar(expectedCigarString.c_str());
00365 
00366     validate();
00367 }
00368 
00369 
00370 void modifyVar::testRemoveSequence()
00371 {
00372     resetExpected();
00373     openAndRead1Rec();
00374 
00375     // Set the Sequence - shorter length.
00376     expectedSequenceString = "*";
00377     samRecord.setSequence(expectedSequenceString.c_str());
00378 
00379     validate();
00380 }
00381 
00382 
00383 void modifyVar::testLengthenSequenceAndQuality()
00384 {
00385     resetExpected();
00386     openAndRead1Rec();
00387 
00388     // Set the Sequence & quality - longer.
00389     expectedSequenceString = "CCGAATT";
00390     expectedQualityString = "!@#$%^&";
00391     samRecord.setSequence(expectedSequenceString.c_str());
00392     samRecord.setQuality(expectedQualityString.c_str());
00393 
00394     // CCGAATT = CC GA AT T = 0x22 0x41 0x18 0x80
00395     expectedSequenceBuffer[0] = 0x22;
00396     expectedSequenceBuffer[1] = 0x41;
00397     expectedSequenceBuffer[2] = 0x18;
00398     expectedSequenceBuffer[3] = 0x80;
00399 
00400     validate();
00401 }
00402 
00403 
00404 void modifyVar::validate()
00405 {
00406     if(myValBufFirst)
00407     {
00408         // get the record.
00409         const bamRecordStruct* recordBuffer = 
00410             (const bamRecordStruct*)samRecord.getRecordBuffer();
00411       
00412         // Validate the buffer.
00413         validateReadName(recordBuffer);
00414         validateCigar(recordBuffer);
00415         validateSequence(recordBuffer);
00416         validateQuality(recordBuffer);
00417         validateTags(recordBuffer);
00418       
00419         // Validate the strings.
00420         validateReadNameString();
00421         validateCigarString();
00422         validateSequenceString();
00423         validateQualityString();
00424         validateTagsString();
00425     }
00426     else
00427     {
00428         // get the record.
00429         const bamRecordStruct* recordBuffer = 
00430             (const bamRecordStruct*)samRecord.getRecordBuffer();
00431       
00432         // Validate the buffer.
00433         validateReadName(recordBuffer);
00434         validateCigar(recordBuffer);
00435         validateSequence(recordBuffer);
00436         validateQuality(recordBuffer);
00437         validateTags(recordBuffer);
00438       
00439         // Validate the strings.
00440         validateReadNameString();
00441         validateCigarString();
00442         validateSequenceString();
00443         validateQualityString();
00444         validateTagsString();
00445     }
00446 }
00447 
00448 void modifyVar::validateReadName(const bamRecordStruct* recordBuffer)
00449 {
00450     const char* varPtr = (const char*)&(recordBuffer->myData);
00451 
00452     unsigned int len = expectedReadNameString.length();
00453     for(unsigned int i = 0; i < len; i++)
00454     {
00455         assert(varPtr[i] == expectedReadNameString[i]);
00456     }
00457 
00458     // Verify ending null.
00459     assert(varPtr[len] == 0);
00460    
00461     // verify the length - add one for the terminating null.
00462     assert(recordBuffer->myReadNameLength == 
00463            expectedReadNameString.length() + 1);
00464 }
00465 
00466 
00467 void modifyVar::validateCigar(const bamRecordStruct* recordBuffer)
00468 {
00469     const unsigned char* cigarStart = 
00470         (const unsigned char*)&(recordBuffer->myData) 
00471         + recordBuffer->myReadNameLength;
00472 
00473     unsigned int* varPtr = (unsigned int*)cigarStart;
00474    
00475     for(int i = 0; i < expectedCigarBufLen; i++)
00476     {
00477         assert(varPtr[i] == expectedCigarBuffer[i]);
00478     }
00479    
00480     assert(recordBuffer->myCigarLength == expectedCigarBufLen);
00481 }
00482 
00483 
00484 void modifyVar::validateSequence(const bamRecordStruct* recordBuffer)
00485 {
00486     // Calculate the sequence length.
00487     int expectedReadLen = expectedSequenceString.length();
00488     int seqLen = (expectedReadLen + 1)/2;
00489     if(expectedSequenceString == "*")
00490     {
00491         expectedReadLen = 0;
00492         seqLen = 0;
00493     }
00494    
00495     const unsigned char* sequencePtr = 
00496         (const unsigned char*)&(recordBuffer->myData) 
00497         + recordBuffer->myReadNameLength + (recordBuffer->myCigarLength * 4);
00498 
00499     for(int i = 0; i < seqLen; i++)
00500     {
00501         assert(sequencePtr[i] == expectedSequenceBuffer[i]);
00502     }
00503 
00504     assert(recordBuffer->myReadLength == expectedReadLen);
00505 }
00506 
00507 
00508 void modifyVar::validateQuality(const bamRecordStruct* recordBuffer)
00509 {
00510     int expectedReadLen = expectedSequenceString.length();
00511     int seqLen = (expectedReadLen + 1)/2;
00512     if(expectedSequenceString == "*")
00513     {
00514         expectedReadLen = 0;
00515         seqLen = 0;
00516     }
00517 
00518     const uint8_t* qualityPtr = 
00519         (const unsigned char*)&(recordBuffer->myData) 
00520         + recordBuffer->myReadNameLength + (recordBuffer->myCigarLength * 4)
00521         + seqLen;
00522  
00523     int qualityLen = expectedQualityString.length();
00524 
00525     for(int i = 0; i < expectedReadLen; i++)
00526     {
00527         if(expectedQualityString == "*")
00528         {
00529             // no quality, so check for 0xFF.
00530             assert(qualityPtr[i] == 0xFF);
00531         }
00532         else if(i >= qualityLen)
00533         {
00534             // Quality is shorter than the sequence, so should be padded with
00535             // 0xFF.
00536             assert(qualityPtr[i] == 0xFF);
00537         }
00538         else
00539         {
00540             assert(qualityPtr[i] == (expectedQualityString[i] - 33));
00541         }
00542     }
00543 
00544     assert(recordBuffer->myReadLength == expectedReadLen);
00545 }
00546 
00547 
00548 void modifyVar::validateTags(const bamRecordStruct* recordBuffer)
00549 {
00550     const unsigned char* tagsPtr = 
00551         (const unsigned char*)&(recordBuffer->myData) 
00552         + recordBuffer->myReadNameLength + (recordBuffer->myCigarLength * 4)
00553         + (recordBuffer->myReadLength + 1)/2 + recordBuffer->myReadLength;
00554  
00555     for(int i = 0; i < expectedTagsLen; i++)
00556     {
00557         assert(tagsPtr[i] == expectedTagsBuffer[i]);
00558     }
00559 
00560     // Calculate expected block size - from the start of the buffer to the
00561     // start of the tags plus the tags length - minus the size of the blocksize
00562     // field.
00563     int32_t expectedBlockSize = tagsPtr - (const unsigned char*)(recordBuffer) 
00564         + expectedTagsLen - 4;
00565     assert(recordBuffer->myBlockSize == expectedBlockSize);
00566 }
00567 
00568 
00569 void modifyVar::validateTagsString()
00570 {
00571     char tag[3];
00572     char type;
00573     void* value;
00574     assert(samRecord.getNextSamTag(tag, type, &value) == true);
00575     assert(tag[0] == 'A');
00576     assert(tag[1] == 'M');
00577     assert(type == 'i');
00578     assert(*(char*)value == 0);
00579     assert(samRecord.getNextSamTag(tag, type, &value) == true);
00580     assert(tag[0] == 'M');
00581     assert(tag[1] == 'D');
00582     assert(type == 'Z');
00583     assert(*(String*)value == "37");
00584     assert(samRecord.getNextSamTag(tag, type, &value) == true);
00585     assert(tag[0] == 'N');
00586     assert(tag[1] == 'M');
00587     assert(type == 'i');
00588     assert(*(char*)value == 0);
00589     assert(samRecord.getNextSamTag(tag, type, &value) == true);
00590     assert(tag[0] == 'X');
00591     assert(tag[1] == 'T');
00592     assert(type == 'A');
00593     assert(*(char*)value == 'R');
00594     // No more tags, should return false.
00595     assert(samRecord.getNextSamTag(tag, type, &value) == false);
00596     assert(samRecord.getNextSamTag(tag, type, &value) == false);   
00597 }
00598 
00599 void modifyVar::validateReadNameString()
00600 {
00601     assert(samRecord.getReadName() == expectedReadNameString);
00602 }
00603 
00604 void modifyVar::validateCigarString()
00605 {
00606     assert(samRecord.getCigar() == expectedCigarString);
00607 }
00608 
00609 void modifyVar::validateSequenceString()
00610 {
00611     assert(samRecord.getSequence() == expectedSequenceString);
00612 }
00613 
00614 void modifyVar::validateQualityString()
00615 {
00616     assert(samRecord.getQuality() == expectedQualityString);
00617 }
00618 
00619 
00620 void modifyVar::resetExpected()
00621 {
00622     expectedReadNameString = "1:1011:F:255+17M15D20M";
00623     expectedCigarString = "5M2D";
00624     expectedSequenceString = "CCGAA";
00625     expectedQualityString = "6>6+4";
00626 
00627     // The default Cigar for record 1 is 5M2D
00628     // 5M = 5 << 4 | 0 = 0x50
00629     // 2D = 2 << 4 | 2 = 0x22
00630     expectedCigarBufLen = 2;
00631     expectedCigarBuffer[0] = 0x50;
00632     expectedCigarBuffer[1] = 0x22;
00633 
00634     // CCGAA = CC GA A = 0x22 0x41 0x10
00635     expectedSequenceBuffer[0] = 0x22;
00636     expectedSequenceBuffer[1] = 0x41;
00637     expectedSequenceBuffer[2] = 0x10;
00638 
00639     expectedTagsLen = 18;
00640     expectedTagsBuffer[0] = 'A';
00641     expectedTagsBuffer[1] =  'M';
00642     expectedTagsBuffer[2] =  'C';
00643     expectedTagsBuffer[3] =  0;
00644     expectedTagsBuffer[4] =  'M';
00645     expectedTagsBuffer[5] =  'D';
00646     expectedTagsBuffer[6] =  'Z';
00647     expectedTagsBuffer[7] =  '3';
00648     expectedTagsBuffer[8] =  '7';
00649     expectedTagsBuffer[9] =  0;
00650     expectedTagsBuffer[10] =  'N';
00651     expectedTagsBuffer[11] =  'M';
00652     expectedTagsBuffer[12] =  'C';
00653     expectedTagsBuffer[13] =  0;
00654     expectedTagsBuffer[14] =  'X';
00655     expectedTagsBuffer[15] =  'T';
00656     expectedTagsBuffer[16] =  'A';
00657     expectedTagsBuffer[17] =  'R';
00658 }
00659 
00660 
00661 void modifyVar::openAndRead1Rec()
00662 {
00663     // Open the file for reading.   
00664     assert(samIn.OpenForRead(myFilename));
00665 
00666     // Read the sam header.
00667     assert(samIn.ReadHeader(samHeader));
00668    
00669     // Read the first record.   
00670     assert(samIn.ReadRecord(samHeader, samRecord));
00671 }
Generated on Wed Nov 17 15:38:27 2010 for StatGen Software by  doxygen 1.6.3