libStatGen Software  1
ModifyVar.cpp
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include "SamFile.h"
00019 #include "ModifyVar.h"
00020 #include "SamValidation.h"
00021 
00022 void modifyFirstBase()
00023 {
00024     SamFile samIn;
00025     // Open the file for reading.   
00026     assert(samIn.OpenForRead("testFiles/testVar.bam"));
00027 
00028     SamFile samOut;
00029     // Open the file for writing.   
00030     assert(samOut.OpenForWrite("results/updateVar.bam"));
00031 
00032     // Read the sam header.
00033     SamFileHeader samHeader;
00034     assert(samIn.ReadHeader(samHeader));
00035 
00036     assert(samOut.WriteHeader(samHeader));
00037 
00038     // Read the sam records.
00039     SamRecord samRecord;
00040 
00041     // Keep reading records until the end of the file is reached.
00042     while(samIn.ReadRecord(samHeader, samRecord))
00043     {
00044         // Successfully read a record from the file, so check to see
00045         // if it is valid.
00046         SamValidationErrors samValidationErrors;
00047         assert(SamValidator::isValid(samHeader, samRecord, 
00048                                      samValidationErrors));
00049       
00050         // Get the sequence.
00051         const char* sequence = samRecord.getSequence();
00052         assert(strcmp(sequence, "") != 0);
00053         std::string upSeq = sequence;
00054         upSeq[0] = 'N';
00055         assert(samRecord.setSequence(upSeq.c_str()));
00056       
00057         // write the sequence.
00058         assert(samOut.WriteRecord(samHeader, samRecord));
00059     }
00060 
00061     // Should have exited only when done reading.
00062     assert(samIn.GetStatus() == SamStatus::NO_MORE_RECS);
00063 }
00064 
00065 
00066 void modifyFirstBaseLong()
00067 {
00068     SamFile samIn;
00069     // Open the file for reading.
00070     assert(samIn.OpenForRead("statusTests/HalfG.bam"));
00071 
00072     SamFile samOut;
00073     // Open the file for writing.   
00074     assert(samOut.OpenForWrite("results/updateSeq.bam"));
00075 
00076     // Read the sam header.
00077     SamFileHeader samHeader;
00078     assert(samIn.ReadHeader(samHeader));
00079 
00080     assert(samOut.WriteHeader(samHeader));
00081 
00082     // Read the sam records.
00083     SamRecord samRecord;
00084 
00085     // Keep reading records until the end of the file is reached.
00086     while(samIn.ReadRecord(samHeader, samRecord))
00087     {
00088         // Successfully read a record from the file, so check to see
00089         // if it is valid.
00090         SamValidationErrors samValidationErrors;
00091         assert(SamValidator::isValid(samHeader, samRecord, 
00092                                      samValidationErrors));
00093       
00094         // Get the sequence.
00095         const char* sequence = samRecord.getSequence();
00096         assert(strcmp(sequence, "") != 0);
00097         std::string upSeq = sequence;
00098         upSeq[0] = 'N';
00099         assert(samRecord.setSequence(upSeq.c_str()));
00100       
00101         // write the sequence.
00102         assert(samOut.WriteRecord(samHeader, samRecord));
00103     }
00104 
00105     // Should have exited only when done reading.
00106     assert(samIn.GetStatus() == SamStatus::NO_MORE_RECS);
00107 }
00108 
00109 
00110 void testModifyVar()
00111 {
00112 #ifdef __ZLIB_AVAILABLE__
00113     modifyFirstBase();
00114 #endif
00115     modifyVar modTest;
00116     modTest.testModifyVar("testFiles/testSam.sam", true);
00117     modTest.testModifyVar("testFiles/testSam.sam", false);
00118 #ifdef __ZLIB_AVAILABLE__
00119     modTest.testModifyVar("testFiles/testBam.bam", true);
00120     modTest.testModifyVar("testFiles/testBam.bam", false);
00121 #endif
00122 }
00123 
00124 
00125 void modifyVar::testModifyVar(const char* filename, bool valBufFirst)
00126 {
00127     myFilename = filename;
00128     myValBufFirst = valBufFirst;
00129 
00130     testModifyReadNameOnlySameLength();
00131     testModifyCigarOnlySameLength();
00132     testModifySequenceOnlySameLength();
00133     testModifyQualityOnlySameLength();
00134     testRemoveQuality();
00135     testShortenQuality();
00136     testLengthenQuality();
00137 
00138     testShortenReadName();
00139     testShortenCigar();
00140     testShortenSequence();
00141 
00142     testLengthenReadName();
00143     testLengthenCigar();
00144     testLengthenSequence();
00145    
00146     testRemoveCigar();
00147     testRemoveSequence();
00148     testLengthenSequenceAndQuality();
00149 }
00150 
00151 
00152 void modifyVar::testModifyReadNameOnlySameLength()
00153 {
00154     resetExpected();
00155     openAndRead1Rec();
00156 
00157     // Set the Read Name - same length, just different name.
00158     expectedReadNameString = "1:1011:G:255+17M15D20M";
00159     samRecord.setReadName(expectedReadNameString.c_str());
00160 
00161     validate();
00162 }
00163 
00164 
00165 void modifyVar::testModifyCigarOnlySameLength()
00166 {
00167     resetExpected();
00168     openAndRead1Rec();
00169 
00170     // Set the Cigar - same length, just different values.
00171     expectedCigarString = "3M2I";
00172     samRecord.setCigar(expectedCigarString.c_str());
00173 
00174     // The new Cigar for record 1 is 3M2I
00175     // 3M = 3 << 4 | 0 = 0x30
00176     // 2I = 2 << 4 | 1 = 0x21
00177     expectedCigarBufLen = 2;
00178     expectedCigarBuffer[0] = 0x30;
00179     expectedCigarBuffer[1] = 0x21;
00180    
00181     validate();
00182 }
00183 
00184 
00185 void modifyVar::testModifySequenceOnlySameLength()
00186 {
00187     resetExpected();
00188     openAndRead1Rec();
00189 
00190     // Set the Sequence - same length, just different values.
00191     expectedSequenceString = "NCGAN";
00192     samRecord.setSequence(expectedSequenceString.c_str());
00193 
00194     // NCGAN = NC GA N = 0xF2 0x41 0xF0
00195     expectedSequenceBuffer[0] = 0xF2;
00196     expectedSequenceBuffer[1] = 0x41;
00197     expectedSequenceBuffer[2] = 0xF0;
00198 
00199     validate();
00200 }
00201 
00202 
00203 void modifyVar::testModifyQualityOnlySameLength()
00204 {
00205     resetExpected();
00206     openAndRead1Rec();
00207    
00208     // Set the Quality - same length, just different values.
00209     expectedQualityString = "!>6+!";
00210     samRecord.setQuality(expectedQualityString.c_str());
00211 
00212     validate();
00213 }
00214 
00215 
00216 void modifyVar::testRemoveQuality()
00217 {
00218     resetExpected();
00219     openAndRead1Rec();
00220    
00221     // Set the Quality - to "*" - does not affect the length since the 
00222     // sequence field drives the length.
00223     expectedQualityString = "*";
00224     samRecord.setQuality(expectedQualityString.c_str());
00225 
00226     validate();
00227 }
00228 
00229 
00230 void modifyVar::testShortenQuality()
00231 {
00232     resetExpected();
00233     openAndRead1Rec();
00234 
00235     // Set the Quality - shorten, but doesn't affect the length since
00236     // the sequence field drives the length.
00237     expectedQualityString = "!!";
00238     samRecord.setQuality(expectedQualityString.c_str());
00239 
00240     validate();
00241 }
00242 
00243 
00244 void modifyVar::testLengthenQuality()
00245 {
00246     resetExpected();
00247     openAndRead1Rec();
00248 
00249     // Set the Quality - lengthen, but doesn't affect the length since
00250     // the sequence field drives the length.
00251     expectedQualityString = "!!@@##";
00252     samRecord.setQuality(expectedQualityString.c_str());
00253 
00254     validate();
00255 }
00256 
00257 
00258 void modifyVar::testShortenReadName()
00259 {
00260     resetExpected();
00261     openAndRead1Rec();
00262 
00263     // Set the Read Name - shorter length
00264     expectedReadNameString = "1:1011:G:255";
00265     samRecord.setReadName(expectedReadNameString.c_str());
00266 
00267     validate();
00268 }
00269 
00270 
00271 void modifyVar::testShortenCigar()
00272 {
00273     resetExpected();
00274     openAndRead1Rec();
00275 
00276     // Set the Cigar - shorter length
00277     expectedCigarString = "5M";
00278     samRecord.setCigar(expectedCigarString.c_str());
00279 
00280     // The new Cigar for record 1 is 5M
00281     // 5M = 5 << 4 | 0 = 0x50
00282     expectedCigarBufLen = 1;
00283     expectedCigarBuffer[0] = 0x50;
00284    
00285     validate();
00286 }
00287 
00288 
00289 void modifyVar::testShortenSequence()
00290 {
00291     resetExpected();
00292     openAndRead1Rec();
00293 
00294     // Set the Sequence - shorter length.
00295     expectedSequenceString = "CCGA";
00296     samRecord.setSequence(expectedSequenceString.c_str());
00297 
00298     // CCGA = CC GA = 0x22 0x41
00299     expectedSequenceBuffer[0] = 0x22;
00300     expectedSequenceBuffer[1] = 0x41;
00301 
00302     validate();
00303 }
00304 
00305 
00306 void modifyVar::testLengthenReadName()
00307 {
00308     resetExpected();
00309     openAndRead1Rec();
00310 
00311     // Set the Read Name - longer.
00312     expectedReadNameString = "1:1011:G:255+17M15D20M:1111111";
00313     samRecord.setReadName(expectedReadNameString.c_str());
00314 
00315     validate();
00316 }
00317 
00318 
00319 void modifyVar::testLengthenCigar()
00320 {
00321     resetExpected();
00322     openAndRead1Rec();
00323 
00324     // Set the Cigar - longer length.
00325     expectedCigarString = "3M2D2I";
00326     samRecord.setCigar(expectedCigarString.c_str());
00327 
00328     // The new Cigar for record 1 is 3M2I
00329     // 3M = 3 << 4 | 0 = 0x30
00330     // 2D = 2 << 2 | 1 = 0x22
00331     // 2I = 2 << 4 | 1 = 0x21
00332     expectedCigarBufLen = 3;
00333     expectedCigarBuffer[0] = 0x30;
00334     expectedCigarBuffer[1] = 0x22;
00335     expectedCigarBuffer[2] = 0x21;
00336    
00337     validate();
00338 }
00339 
00340 
00341 void modifyVar::testLengthenSequence()
00342 {
00343     resetExpected();
00344     openAndRead1Rec();
00345 
00346     // Set the Sequence - longer length.
00347     expectedSequenceString = "CCGAATT";
00348     samRecord.setSequence(expectedSequenceString.c_str());
00349 
00350     // CCGAATT = CC GA AT T = 0x22 0x41 0x18 0x80
00351     expectedSequenceBuffer[0] = 0x22;
00352     expectedSequenceBuffer[1] = 0x41;
00353     expectedSequenceBuffer[2] = 0x18;
00354     expectedSequenceBuffer[3] = 0x80;
00355 
00356     validate();
00357 }
00358 
00359 
00360 void modifyVar::testRemoveCigar()
00361 {
00362     resetExpected();
00363     openAndRead1Rec();
00364 
00365     // Set the Cigar - same length, just different values.
00366     expectedCigarString = "*";
00367     expectedCigarBufLen = 0;
00368     samRecord.setCigar(expectedCigarString.c_str());
00369 
00370     validate();
00371 }
00372 
00373 
00374 void modifyVar::testRemoveSequence()
00375 {
00376     resetExpected();
00377     openAndRead1Rec();
00378 
00379     // Set the Sequence - shorter length.
00380     expectedSequenceString = "*";
00381     samRecord.setSequence(expectedSequenceString.c_str());
00382 
00383     validate();
00384 }
00385 
00386 
00387 void modifyVar::testLengthenSequenceAndQuality()
00388 {
00389     resetExpected();
00390     openAndRead1Rec();
00391 
00392     // Set the Sequence & quality - longer.
00393     expectedSequenceString = "CCGAATT";
00394     expectedQualityString = "!@#$%^&";
00395     samRecord.setSequence(expectedSequenceString.c_str());
00396     samRecord.setQuality(expectedQualityString.c_str());
00397 
00398     // CCGAATT = CC GA AT T = 0x22 0x41 0x18 0x80
00399     expectedSequenceBuffer[0] = 0x22;
00400     expectedSequenceBuffer[1] = 0x41;
00401     expectedSequenceBuffer[2] = 0x18;
00402     expectedSequenceBuffer[3] = 0x80;
00403 
00404     validate();
00405 }
00406 
00407 
00408 void modifyVar::validate()
00409 {
00410     if(myValBufFirst)
00411     {
00412         // get the record.
00413         const bamRecordStruct* recordBuffer = 
00414             (const bamRecordStruct*)samRecord.getRecordBuffer();
00415       
00416         // Validate the buffer.
00417         validateReadName(recordBuffer);
00418         validateCigar(recordBuffer);
00419         validateSequence(recordBuffer);
00420         validateQuality(recordBuffer);
00421         validateTags(recordBuffer);
00422       
00423         // Validate the strings.
00424         validateReadNameString();
00425         validateCigarString();
00426         validateSequenceString();
00427         validateQualityString();
00428         validateTagsString();
00429     }
00430     else
00431     {
00432         // get the record.
00433         const bamRecordStruct* recordBuffer = 
00434             (const bamRecordStruct*)samRecord.getRecordBuffer();
00435       
00436         // Validate the buffer.
00437         validateReadName(recordBuffer);
00438         validateCigar(recordBuffer);
00439         validateSequence(recordBuffer);
00440         validateQuality(recordBuffer);
00441         validateTags(recordBuffer);
00442       
00443         // Validate the strings.
00444         validateReadNameString();
00445         validateCigarString();
00446         validateSequenceString();
00447         validateQualityString();
00448         validateTagsString();
00449     }
00450 }
00451 
00452 void modifyVar::validateReadName(const bamRecordStruct* recordBuffer)
00453 {
00454     const char* varPtr = (const char*)&(recordBuffer->myData);
00455 
00456     unsigned int len = expectedReadNameString.length();
00457     for(unsigned int i = 0; i < len; i++)
00458     {
00459         assert(varPtr[i] == expectedReadNameString[i]);
00460     }
00461 
00462     // Verify ending null.
00463     assert(varPtr[len] == 0);
00464    
00465     // verify the length - add one for the terminating null.
00466     assert(recordBuffer->myReadNameLength == 
00467            expectedReadNameString.length() + 1);
00468 }
00469 
00470 
00471 void modifyVar::validateCigar(const bamRecordStruct* recordBuffer)
00472 {
00473     const unsigned char* cigarStart = 
00474         (const unsigned char*)&(recordBuffer->myData) 
00475         + recordBuffer->myReadNameLength;
00476 
00477     unsigned int* varPtr = (unsigned int*)cigarStart;
00478    
00479     for(int i = 0; i < expectedCigarBufLen; i++)
00480     {
00481         assert(varPtr[i] == expectedCigarBuffer[i]);
00482     }
00483    
00484     assert(recordBuffer->myCigarLength == expectedCigarBufLen);
00485 }
00486 
00487 
00488 void modifyVar::validateSequence(const bamRecordStruct* recordBuffer)
00489 {
00490     // Calculate the sequence length.
00491     int expectedReadLen = expectedSequenceString.length();
00492     int seqLen = (expectedReadLen + 1)/2;
00493     if(expectedSequenceString == "*")
00494     {
00495         expectedReadLen = 0;
00496         seqLen = 0;
00497     }
00498    
00499     const unsigned char* sequencePtr = 
00500         (const unsigned char*)&(recordBuffer->myData) 
00501         + recordBuffer->myReadNameLength + (recordBuffer->myCigarLength * 4);
00502 
00503     for(int i = 0; i < seqLen; i++)
00504     {
00505         assert(sequencePtr[i] == expectedSequenceBuffer[i]);
00506     }
00507 
00508     assert(recordBuffer->myReadLength == expectedReadLen);
00509 }
00510 
00511 
00512 void modifyVar::validateQuality(const bamRecordStruct* recordBuffer)
00513 {
00514     int expectedReadLen = expectedSequenceString.length();
00515     int seqLen = (expectedReadLen + 1)/2;
00516     if(expectedSequenceString == "*")
00517     {
00518         expectedReadLen = 0;
00519         seqLen = 0;
00520     }
00521 
00522     const uint8_t* qualityPtr = 
00523         (const unsigned char*)&(recordBuffer->myData) 
00524         + recordBuffer->myReadNameLength + (recordBuffer->myCigarLength * 4)
00525         + seqLen;
00526  
00527     int qualityLen = expectedQualityString.length();
00528 
00529     for(int i = 0; i < expectedReadLen; i++)
00530     {
00531         if(expectedQualityString == "*")
00532         {
00533             // no quality, so check for 0xFF.
00534             assert(qualityPtr[i] == 0xFF);
00535         }
00536         else if(i >= qualityLen)
00537         {
00538             // Quality is shorter than the sequence, so should be padded with
00539             // 0xFF.
00540             assert(qualityPtr[i] == 0xFF);
00541         }
00542         else
00543         {
00544             assert(qualityPtr[i] == (expectedQualityString[i] - 33));
00545         }
00546     }
00547 
00548     assert(recordBuffer->myReadLength == expectedReadLen);
00549 }
00550 
00551 
00552 void modifyVar::validateTags(const bamRecordStruct* recordBuffer)
00553 {
00554     const unsigned char* tagsPtr = 
00555         (const unsigned char*)&(recordBuffer->myData) 
00556         + recordBuffer->myReadNameLength + (recordBuffer->myCigarLength * 4)
00557         + (recordBuffer->myReadLength + 1)/2 + recordBuffer->myReadLength;
00558  
00559     for(int i = 0; i < expectedTagsLen; i++)
00560     {
00561         assert(tagsPtr[i] == expectedTagsBuffer[i]);
00562     }
00563 
00564     // Calculate expected block size - from the start of the buffer to the
00565     // start of the tags plus the tags length - minus the size of the blocksize
00566     // field.
00567     int32_t expectedBlockSize = tagsPtr - (const unsigned char*)(recordBuffer) 
00568         + expectedTagsLen - 4;
00569     assert(recordBuffer->myBlockSize == expectedBlockSize);
00570 }
00571 
00572 
00573 void modifyVar::validateTagsString()
00574 {
00575     char tag[3];
00576     char type;
00577     void* value;
00578     assert(samRecord.getNextSamTag(tag, type, &value) == true);
00579     assert(tag[0] == 'A');
00580     assert(tag[1] == 'M');
00581     assert(type == 'i');
00582     assert(*(char*)value == 0);
00583     assert(samRecord.getNextSamTag(tag, type, &value) == true);
00584     assert(tag[0] == 'M');
00585     assert(tag[1] == 'D');
00586     assert(type == 'Z');
00587     assert(*(String*)value == "37");
00588     assert(samRecord.getNextSamTag(tag, type, &value) == true);
00589     assert(tag[0] == 'N');
00590     assert(tag[1] == 'M');
00591     assert(type == 'i');
00592     assert(*(char*)value == 0);
00593     assert(samRecord.getNextSamTag(tag, type, &value) == true);
00594     assert(tag[0] == 'X');
00595     assert(tag[1] == 'T');
00596     assert(type == 'A');
00597     assert(*(char*)value == 'R');
00598     // No more tags, should return false.
00599     assert(samRecord.getNextSamTag(tag, type, &value) == false);
00600     assert(samRecord.getNextSamTag(tag, type, &value) == false);   
00601 }
00602 
00603 void modifyVar::validateReadNameString()
00604 {
00605     assert(samRecord.getReadName() == expectedReadNameString);
00606 }
00607 
00608 void modifyVar::validateCigarString()
00609 {
00610     assert(samRecord.getCigar() == expectedCigarString);
00611 }
00612 
00613 void modifyVar::validateSequenceString()
00614 {
00615     assert(samRecord.getSequence() == expectedSequenceString);
00616 }
00617 
00618 void modifyVar::validateQualityString()
00619 {
00620     assert(samRecord.getQuality() == expectedQualityString);
00621 }
00622 
00623 
00624 void modifyVar::resetExpected()
00625 {
00626     expectedReadNameString = "1:1011:F:255+17M15D20M";
00627     expectedCigarString = "5M2D";
00628     expectedSequenceString = "CCGAA";
00629     expectedQualityString = "6>6+4";
00630 
00631     // The default Cigar for record 1 is 5M2D
00632     // 5M = 5 << 4 | 0 = 0x50
00633     // 2D = 2 << 4 | 2 = 0x22
00634     expectedCigarBufLen = 2;
00635     expectedCigarBuffer[0] = 0x50;
00636     expectedCigarBuffer[1] = 0x22;
00637 
00638     // CCGAA = CC GA A = 0x22 0x41 0x10
00639     expectedSequenceBuffer[0] = 0x22;
00640     expectedSequenceBuffer[1] = 0x41;
00641     expectedSequenceBuffer[2] = 0x10;
00642 
00643     expectedTagsLen = 18;
00644     expectedTagsBuffer[0] = 'A';
00645     expectedTagsBuffer[1] =  'M';
00646     expectedTagsBuffer[2] =  'C';
00647     expectedTagsBuffer[3] =  0;
00648     expectedTagsBuffer[4] =  'M';
00649     expectedTagsBuffer[5] =  'D';
00650     expectedTagsBuffer[6] =  'Z';
00651     expectedTagsBuffer[7] =  '3';
00652     expectedTagsBuffer[8] =  '7';
00653     expectedTagsBuffer[9] =  0;
00654     expectedTagsBuffer[10] =  'N';
00655     expectedTagsBuffer[11] =  'M';
00656     expectedTagsBuffer[12] =  'C';
00657     expectedTagsBuffer[13] =  0;
00658     expectedTagsBuffer[14] =  'X';
00659     expectedTagsBuffer[15] =  'T';
00660     expectedTagsBuffer[16] =  'A';
00661     expectedTagsBuffer[17] =  'R';
00662 }
00663 
00664 
00665 void modifyVar::openAndRead1Rec()
00666 {
00667     // Open the file for reading.   
00668     assert(samIn.OpenForRead(myFilename));
00669 
00670     // Read the sam header.
00671     assert(samIn.ReadHeader(samHeader));
00672    
00673     // Read the first record.   
00674     assert(samIn.ReadRecord(samHeader, samRecord));
00675 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends