libStatGen Software  1
GenomeSequence_test.cpp
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include <gtest/gtest.h>
00019 
00020 #include "GenomeSequence.h"
00021 
00022 const char* RM_BS_REFERENCE = "rm -f ./phiX-bs.umfa";
00023 const char* RM_CS_REFERENCE = "rm -f ./phiX-cs.umfa";
00024 const char* REFERENCE_NAME = "./phiX.fa";
00025 
00026 TEST(GenomeSequenceTest, staticLookupTest)
00027 {
00028     GenomeSequence s;
00029     // quick sanity check...
00030     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'A']], 'A');
00031     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'a']], 'A');
00032     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'T']], 'T');
00033     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 't']], 'T');
00034     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'C']], 'C');
00035     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'c']], 'C');
00036     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'G']], 'G');
00037     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'g']], 'G');
00038     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'N']], 'N');
00039     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'n']], 'N');
00040     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'M']], 'M');
00041     EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'm']], 'M');
00042 
00043     EXPECT_EQ(GenomeSequence::base2int[(int) 'N'], 4);
00044     EXPECT_EQ(GenomeSequence::base2int[(int) 'n'], 4);
00045     EXPECT_EQ(GenomeSequence::base2int[(int) 'A'], 0);
00046     EXPECT_EQ(GenomeSequence::base2int[(int) 'a'], 0);
00047     EXPECT_EQ(GenomeSequence::base2int[(int) 'T'], 3);
00048     EXPECT_EQ(GenomeSequence::base2int[(int) 't'], 3);
00049     EXPECT_EQ(GenomeSequence::base2int[(int) 'C'], 1);
00050     EXPECT_EQ(GenomeSequence::base2int[(int) 'c'], 1);
00051     EXPECT_EQ(GenomeSequence::base2int[(int) 'G'], 2);
00052     EXPECT_EQ(GenomeSequence::base2int[(int) 'g'], 2);
00053 }
00054 
00055 
00056 TEST(GenomeSequenceTest, testBaseSpaceReference)
00057 {
00058     GenomeSequence s;
00059     int exitCode = system(RM_BS_REFERENCE);
00060     EXPECT_EQ(exitCode, 0);
00061 
00062     s.setReferenceName(REFERENCE_NAME);
00063     bool rc = s.create(false);
00064     EXPECT_EQ(rc, false);
00065     EXPECT_EQ(s[0], 'G');
00066     EXPECT_EQ(s[1], 'A');
00067     EXPECT_EQ(s[2], 'G');
00068     EXPECT_EQ(s[s.getNumberBases()-3], 'G');
00069     EXPECT_EQ(s[s.getNumberBases()-2], 'C');
00070     EXPECT_EQ(s[s.getNumberBases()-1], 'A');
00071     EXPECT_EQ(s[s.getNumberBases()], 'N');  // check bounds checker
00072 
00073     s.close();
00074 }
00075 
00076 TEST(GenomeSequenceTest, testColorSpaceReference)
00077 {
00078     GenomeSequence s;
00079     int exitCode = system(RM_CS_REFERENCE);
00080     EXPECT_EQ(exitCode, 0);
00081 
00082     s.setReferenceName(REFERENCE_NAME);
00083     bool rc = s.create(true);
00084 
00085     // NB: I did not calculate these expected values, I just
00086     // read them from the converted genome and set them here.
00087     // So in theory, they should be checked by hand to ensure
00088     // that they are correct.
00089     EXPECT_EQ(rc, false);
00090     EXPECT_EQ(s[0], 'N');   // in color space, first symbol is unknown
00091     EXPECT_EQ(s[1], '2');
00092     EXPECT_EQ(s[2], '2');
00093     EXPECT_EQ(s[s.getNumberBases()-3], '1');
00094     EXPECT_EQ(s[s.getNumberBases()-2], '3');
00095     EXPECT_EQ(s[s.getNumberBases()-1], '1');
00096     EXPECT_EQ(s[s.getNumberBases()], 'N');  // check bounds checker
00097 
00098     s.close();
00099 }
00100 
00101 #if 0
00102 void simplestExample(void)
00103 {
00104     GenomeSequence  reference;
00105     genomeIndex_t   index;
00106 
00107     // a particular reference is set by:
00108     // reference.setFastaName("/usr/cluster/share/karma/human_g1k_v37_12CS.fa")
00109     //
00110     // In the above example, the suffix .fa is stripped and replaced with .umfa,
00111     // which contains the actual file being opened.
00112     //
00113     if (reference.open()) {
00114         perror("GenomeSequence::open");
00115         exit(1);
00116     }
00117 
00118 
00119     index = 1000000000; // 10^9
00120     //
00121     // Write the base at the given index.  Here, index is 0 based,
00122     // and is across the whole genome, as all chromosomes are sequentially
00123     // concatenated, so the allowed range is
00124     //
00125     // 0.. (reference.getChromosomeStart(last) + reference.getChromosomeSize(last))
00126     //
00127     // (where int last = reference.getChromosomeCount() - 1;)
00128     //
00129     std::cout << "base[" << index << "] = " << reference[index] << std::endl;
00130 
00131     //
00132     // Example for finding chromosome and one based chromosome position given
00133     // and absolute position on the genome in 'index':
00134     //
00135     int chr = reference.getChromosome(index);
00136     genomeIndex_t   chrIndex = index - reference.getChromosomeStart(chr) + 1;   // 1-based
00137 
00138     std::cout << "genome index " << index << " corresponds to chromosome " << chr << " position " << chrIndex << std::endl;
00139 
00140     //
00141     // Example for finding an absolute genome index position when the
00142     // chromosome name and one based position are known:
00143     //
00144     const char *chromosomeName = "5";
00145     chr = reference.getChromosome(chromosomeName);     // 0-based
00146     chrIndex = 100000;                      // 1-based
00147 
00148     index = reference.getChromosomeStart(chr) + chrIndex - 1;
00149 
00150     std::cout << "Chromosome '" << chromosomeName << "' position " << chrIndex << " corresponds to genome index position " << index << std::endl;
00151 
00152     reference.close();
00153 }
00154 
00155 void testGenomeSequence(void)
00156 {
00157     GenomeSequence reference;
00158 
00159 #if 0
00160     std::string referenceName = "someotherreference";
00161     if (reference.setFastaName(referenceName)) {
00162         std::cerr << "failed to open reference file "
00163                   << referenceName
00164                   << std::endl;
00165         exit(1);
00166     }
00167 #endif
00168 
00169     std::cerr << "open and prefetch the reference genome: ";
00170 
00171     // open it
00172     if (reference.open()) {
00173         exit(1);
00174     }
00175     std::cerr << "done!" << std::endl;
00176 
00177     //
00178     // For the human genome, genomeIndex ranges from 0 to 3.2x10^9
00179     //
00180     genomeIndex_t   genomeIndex;    // 0 based
00181     unsigned int chromosomeIndex;   // 1 based
00182     unsigned int chromosome;        // 0..23 or so
00183     std::string chromosomeName;
00184 
00185     //
00186     // Here we'll start with a chromosome name, then obtain the genome
00187     // index, and use it to find the base we want:
00188     //
00189     chromosomeName = "2";
00190     chromosomeIndex = 1234567;
00191     // this call is slow (string search for chromsomeName):
00192     genomeIndex = reference.getGenomePosition(chromosomeName.c_str(), chromosomeIndex);
00193     assert(genomeIndex!=INVALID_GENOME_INDEX);
00194     std::cout << "Chromosome " << chromosomeName << ", index ";
00195     std::cout << chromosomeIndex << " contains base " << reference[genomeIndex];
00196     std::cout << " at genome index position " << genomeIndex << std::endl;
00197 
00198     //
00199     // now reverse it - given a genomeIndex from above, find the chromosome
00200     // name and index:
00201     //
00202 
00203     // slow (binary search on genomeIndex):
00204     chromosome = reference.getChromosome(genomeIndex);
00205     unsigned int newChromosomeIndex;
00206     // not slow:
00207     newChromosomeIndex = genomeIndex - reference.getChromosomeStart(chromosome) + 1;
00208 
00209     assert(chromosomeIndex == newChromosomeIndex);
00210 
00211     // more testing... at least test and use PackedRead:
00212     //
00213     PackedRead pr;
00214 
00215     pr.set("ATCGATCG", 0);
00216     assert(pr.size()==8);
00217     assert(pr[0]==GenomeSequence::base2int[(int) 'A']);
00218     assert(pr[1]==GenomeSequence::base2int[(int) 'T']);
00219     assert(pr[2]==GenomeSequence::base2int[(int) 'C']);
00220     assert(pr[3]==GenomeSequence::base2int[(int) 'G']);
00221     pr.set("ATCGATCG", 1);
00222     assert(pr.size()==9);
00223     pr.set("", 0);
00224     assert(pr.size()==0);
00225     pr.set("", 1);
00226     assert(pr.size()==1);
00227     pr.set("", 2);
00228     assert(pr.size()==2);
00229     pr.set("", 3);
00230     assert(pr.size()==3);
00231     assert(pr[0]==GenomeSequence::base2int[(int) 'N']);
00232     assert(pr[1]==GenomeSequence::base2int[(int) 'N']);
00233     assert(pr[2]==GenomeSequence::base2int[(int) 'N']);
00234     pr.set("C", 1);
00235     assert(pr.size()==2);
00236     assert(pr[0]==GenomeSequence::base2int[(int) 'N']);
00237     assert(pr[1]==GenomeSequence::base2int[(int) 'C']);
00238 
00239 }
00240 
00241 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends