GenomeSequence_test.cpp

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include <gtest/gtest.h>
00019 
00020 #include "GenomeSequence.h"
00021 
00022 TEST(GenomeSequenceTest, staticLookupTest)
00023 {
00024     GenomeSequence s;
00025     // quick sanity check...
00026     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'A']], 'A');
00027     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'a']], 'A');
00028     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'T']], 'T');
00029     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 't']], 'T');
00030     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'C']], 'C');
00031     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'c']], 'C');
00032     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'G']], 'G');
00033     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'g']], 'G');
00034     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'N']], 'N');
00035     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'n']], 'N');
00036     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'M']], 'M');
00037     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'm']], 'M');
00038 
00039     EXPECT_EQ(GenomeSequence::base2int[(int) 'N'], 4);
00040     EXPECT_EQ(GenomeSequence::base2int[(int) 'n'], 4);
00041     EXPECT_EQ(GenomeSequence::base2int[(int) 'A'], 0);
00042     EXPECT_EQ(GenomeSequence::base2int[(int) 'a'], 0);
00043     EXPECT_EQ(GenomeSequence::base2int[(int) 'T'], 3);
00044     EXPECT_EQ(GenomeSequence::base2int[(int) 't'], 3);
00045     EXPECT_EQ(GenomeSequence::base2int[(int) 'C'], 1);
00046     EXPECT_EQ(GenomeSequence::base2int[(int) 'c'], 1);
00047     EXPECT_EQ(GenomeSequence::base2int[(int) 'G'], 2);
00048     EXPECT_EQ(GenomeSequence::base2int[(int) 'g'], 2);
00049 }
00050 
00051 TEST(GenomeSequenceTest, testBaseSpaceReference)
00052 {
00053     GenomeSequence s;
00054     int exitCode = system("rm -f ../../testdata/phiX-bs.umfa");
00055     EXPECT_EQ(exitCode, 0);
00056 
00057     s.setReferenceName("../../testdata/phiX.fa");
00058     bool rc = s.create(false);
00059     EXPECT_EQ(rc, false);
00060     EXPECT_EQ(s[0], 'G');
00061     EXPECT_EQ(s[1], 'A');
00062     EXPECT_EQ(s[2], 'G');
00063     EXPECT_EQ(s[s.getNumberBases()-3], 'G');
00064     EXPECT_EQ(s[s.getNumberBases()-2], 'C');
00065     EXPECT_EQ(s[s.getNumberBases()-1], 'A');
00066     EXPECT_EQ(s[s.getNumberBases()], 'N');  // check bounds checker
00067 
00068     s.close();
00069 }
00070 
00071 TEST(GenomeSequenceTest, testColorSpaceReference)
00072 {
00073     GenomeSequence s;
00074     int exitCode = system("rm -f ../../testdata/phiX-cs.umfa");
00075     EXPECT_EQ(exitCode, 0);
00076 
00077     s.setReferenceName("../../testdata/phiX.fa");
00078     bool rc = s.create(true);
00079 
00080     // NB: I did not calculate these expected values, I just
00081     // read them from the converted genome and set them here.
00082     // So in theory, they should be checked by hand to ensure
00083     // that they are correct.
00084     EXPECT_EQ(rc, false);
00085     EXPECT_EQ(s[0], 'N');   // in color space, first symbol is unknown
00086     EXPECT_EQ(s[1], '2');
00087     EXPECT_EQ(s[2], '2');
00088     EXPECT_EQ(s[s.getNumberBases()-3], '1');
00089     EXPECT_EQ(s[s.getNumberBases()-2], '3');
00090     EXPECT_EQ(s[s.getNumberBases()-1], '1');
00091     EXPECT_EQ(s[s.getNumberBases()], 'N');  // check bounds checker
00092 
00093     s.close();
00094 }
00095 
00096 #if 0
00097 void simplestExample(void)
00098 {
00099     GenomeSequence  reference;
00100     genomeIndex_t   index;
00101 
00102     // a particular reference is set by:
00103     // reference.setFastaName("/usr/cluster/share/karma/human_g1k_v37_12CS.fa")
00104     //
00105     // In the above example, the suffix .fa is stripped and replaced with .umfa,
00106     // which contains the actual file being opened.
00107     //
00108     if (reference.open()) {
00109         perror("GenomeSequence::open");
00110         exit(1);
00111     }
00112 
00113 
00114     index = 1000000000; // 10^9
00115     //
00116     // Write the base at the given index.  Here, index is 0 based,
00117     // and is across the whole genome, as all chromosomes are sequentially
00118     // concatenated, so the allowed range is
00119     //
00120     // 0.. (reference.getChromosomeStart(last) + reference.getChromosomeSize(last))
00121     //
00122     // (where int last = reference.getChromosomeCount() - 1;)
00123     //
00124     std::cout << "base[" << index << "] = " << reference[index] << std::endl;
00125 
00126     //
00127     // Example for finding chromosome and one based chromosome position given
00128     // and absolute position on the genome in 'index':
00129     //
00130     int chr = reference.getChromosome(index);
00131     genomeIndex_t   chrIndex = index - reference.getChromosomeStart(chr) + 1;   // 1-based
00132 
00133     std::cout << "genome index " << index << " corresponds to chromosome " << chr << " position " << chrIndex << std::endl;
00134 
00135     //
00136     // Example for finding an absolute genome index position when the
00137     // chromosome name and one based position are known:
00138     //
00139     const char *chromosomeName = "5";
00140     chr = reference.getChromosome(chromosomeName);     // 0-based
00141     chrIndex = 100000;                      // 1-based
00142 
00143     index = reference.getChromosomeStart(chr) + chrIndex - 1;
00144 
00145     std::cout << "Chromosome '" << chromosomeName << "' position " << chrIndex << " corresponds to genome index position " << index << std::endl;
00146 
00147     reference.close();
00148 }
00149 
00150 void testGenomeSequence(void)
00151 {
00152     GenomeSequence reference;
00153 
00154 #if 0
00155     std::string referenceName = "someotherreference";
00156     if (reference.setFastaName(referenceName)) {
00157         std::cerr << "failed to open reference file "
00158                   << referenceName
00159                   << std::endl;
00160         exit(1);
00161     }
00162 #endif
00163 
00164     std::cerr << "open and prefetch the reference genome: ";
00165 
00166     // open it
00167     if (reference.open()) {
00168         exit(1);
00169     }
00170     std::cerr << "done!" << std::endl;
00171 
00172     //
00173     // For the human genome, genomeIndex ranges from 0 to 3.2x10^9
00174     //
00175     genomeIndex_t   genomeIndex;    // 0 based
00176     unsigned int chromosomeIndex;   // 1 based
00177     unsigned int chromosome;        // 0..23 or so
00178     std::string chromosomeName;
00179 
00180     //
00181     // Here we'll start with a chromosome name, then obtain the genome
00182     // index, and use it to find the base we want:
00183     //
00184     chromosomeName = "2";
00185     chromosomeIndex = 1234567;
00186     // this call is slow (string search for chromsomeName):
00187     genomeIndex = reference.getGenomePosition(chromosomeName.c_str(), chromosomeIndex);
00188     assert(genomeIndex!=INVALID_GENOME_INDEX);
00189     std::cout << "Chromosome " << chromosomeName << ", index ";
00190     std::cout << chromosomeIndex << " contains base " << reference[genomeIndex];
00191     std::cout << " at genome index position " << genomeIndex << std::endl;
00192 
00193     //
00194     // now reverse it - given a genomeIndex from above, find the chromosome
00195     // name and index:
00196     //
00197 
00198     // slow (binary search on genomeIndex):
00199     chromosome = reference.getChromosome(genomeIndex);
00200     unsigned int newChromosomeIndex;
00201     // not slow:
00202     newChromosomeIndex = genomeIndex - reference.getChromosomeStart(chromosome) + 1;
00203 
00204     assert(chromosomeIndex == newChromosomeIndex);
00205 
00206     // more testing... at least test and use PackedRead:
00207     //
00208     PackedRead pr;
00209 
00210     pr.set("ATCGATCG", 0);
00211     assert(pr.size()==8);
00212     assert(pr[0]==GenomeSequence::base2int[(int) 'A']);
00213     assert(pr[1]==GenomeSequence::base2int[(int) 'T']);
00214     assert(pr[2]==GenomeSequence::base2int[(int) 'C']);
00215     assert(pr[3]==GenomeSequence::base2int[(int) 'G']);
00216     pr.set("ATCGATCG", 1);
00217     assert(pr.size()==9);
00218     pr.set("", 0);
00219     assert(pr.size()==0);
00220     pr.set("", 1);
00221     assert(pr.size()==1);
00222     pr.set("", 2);
00223     assert(pr.size()==2);
00224     pr.set("", 3);
00225     assert(pr.size()==3);
00226     assert(pr[0]==GenomeSequence::base2int[(int) 'N']);
00227     assert(pr[1]==GenomeSequence::base2int[(int) 'N']);
00228     assert(pr[2]==GenomeSequence::base2int[(int) 'N']);
00229     pr.set("C", 1);
00230     assert(pr.size()==2);
00231     assert(pr[0]==GenomeSequence::base2int[(int) 'N']);
00232     assert(pr[1]==GenomeSequence::base2int[(int) 'C']);
00233 
00234 }
00235 
00236 #endif
Generated on Wed Nov 17 15:38:29 2010 for StatGen Software by  doxygen 1.6.3