libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #include <gtest/gtest.h> 00019 00020 #include "GenomeSequence.h" 00021 00022 const char* RM_BS_REFERENCE = "rm -f ./phiX-bs.umfa"; 00023 const char* RM_CS_REFERENCE = "rm -f ./phiX-cs.umfa"; 00024 const char* REFERENCE_NAME = "./phiX.fa"; 00025 00026 TEST(GenomeSequenceTest, staticLookupTest) 00027 { 00028 GenomeSequence s; 00029 // quick sanity check... 00030 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'A']], 'A'); 00031 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'a']], 'A'); 00032 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'T']], 'T'); 00033 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 't']], 'T'); 00034 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'C']], 'C'); 00035 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'c']], 'C'); 00036 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'G']], 'G'); 00037 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'g']], 'G'); 00038 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'N']], 'N'); 00039 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'n']], 'N'); 00040 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'M']], 'M'); 00041 EXPECT_EQ(GenomeSequence::int2base[GenomeSequence::base2int[(int) 'm']], 'M'); 00042 00043 EXPECT_EQ(GenomeSequence::base2int[(int) 'N'], 4); 00044 EXPECT_EQ(GenomeSequence::base2int[(int) 'n'], 4); 00045 EXPECT_EQ(GenomeSequence::base2int[(int) 'A'], 0); 00046 EXPECT_EQ(GenomeSequence::base2int[(int) 'a'], 0); 00047 EXPECT_EQ(GenomeSequence::base2int[(int) 'T'], 3); 00048 EXPECT_EQ(GenomeSequence::base2int[(int) 't'], 3); 00049 EXPECT_EQ(GenomeSequence::base2int[(int) 'C'], 1); 00050 EXPECT_EQ(GenomeSequence::base2int[(int) 'c'], 1); 00051 EXPECT_EQ(GenomeSequence::base2int[(int) 'G'], 2); 00052 EXPECT_EQ(GenomeSequence::base2int[(int) 'g'], 2); 00053 } 00054 00055 00056 TEST(GenomeSequenceTest, testBaseSpaceReference) 00057 { 00058 GenomeSequence s; 00059 int exitCode = system(RM_BS_REFERENCE); 00060 EXPECT_EQ(exitCode, 0); 00061 00062 s.setReferenceName(REFERENCE_NAME); 00063 bool rc = s.create(false); 00064 EXPECT_EQ(rc, false); 00065 EXPECT_EQ(s[0], 'G'); 00066 EXPECT_EQ(s[1], 'A'); 00067 EXPECT_EQ(s[2], 'G'); 00068 EXPECT_EQ(s[s.getNumberBases()-3], 'G'); 00069 EXPECT_EQ(s[s.getNumberBases()-2], 'C'); 00070 EXPECT_EQ(s[s.getNumberBases()-1], 'A'); 00071 EXPECT_EQ(s[s.getNumberBases()], 'N'); // check bounds checker 00072 00073 s.close(); 00074 } 00075 00076 TEST(GenomeSequenceTest, testColorSpaceReference) 00077 { 00078 GenomeSequence s; 00079 int exitCode = system(RM_CS_REFERENCE); 00080 EXPECT_EQ(exitCode, 0); 00081 00082 s.setReferenceName(REFERENCE_NAME); 00083 bool rc = s.create(true); 00084 00085 // NB: I did not calculate these expected values, I just 00086 // read them from the converted genome and set them here. 00087 // So in theory, they should be checked by hand to ensure 00088 // that they are correct. 00089 EXPECT_EQ(rc, false); 00090 EXPECT_EQ(s[0], 'N'); // in color space, first symbol is unknown 00091 EXPECT_EQ(s[1], '2'); 00092 EXPECT_EQ(s[2], '2'); 00093 EXPECT_EQ(s[s.getNumberBases()-3], '1'); 00094 EXPECT_EQ(s[s.getNumberBases()-2], '3'); 00095 EXPECT_EQ(s[s.getNumberBases()-1], '1'); 00096 EXPECT_EQ(s[s.getNumberBases()], 'N'); // check bounds checker 00097 00098 s.close(); 00099 } 00100 00101 #if 0 00102 void simplestExample(void) 00103 { 00104 GenomeSequence reference; 00105 genomeIndex_t index; 00106 00107 // a particular reference is set by: 00108 // reference.setFastaName("/usr/cluster/share/karma/human_g1k_v37_12CS.fa") 00109 // 00110 // In the above example, the suffix .fa is stripped and replaced with .umfa, 00111 // which contains the actual file being opened. 00112 // 00113 if (reference.open()) { 00114 perror("GenomeSequence::open"); 00115 exit(1); 00116 } 00117 00118 00119 index = 1000000000; // 10^9 00120 // 00121 // Write the base at the given index. Here, index is 0 based, 00122 // and is across the whole genome, as all chromosomes are sequentially 00123 // concatenated, so the allowed range is 00124 // 00125 // 0.. (reference.getChromosomeStart(last) + reference.getChromosomeSize(last)) 00126 // 00127 // (where int last = reference.getChromosomeCount() - 1;) 00128 // 00129 std::cout << "base[" << index << "] = " << reference[index] << std::endl; 00130 00131 // 00132 // Example for finding chromosome and one based chromosome position given 00133 // and absolute position on the genome in 'index': 00134 // 00135 int chr = reference.getChromosome(index); 00136 genomeIndex_t chrIndex = index - reference.getChromosomeStart(chr) + 1; // 1-based 00137 00138 std::cout << "genome index " << index << " corresponds to chromosome " << chr << " position " << chrIndex << std::endl; 00139 00140 // 00141 // Example for finding an absolute genome index position when the 00142 // chromosome name and one based position are known: 00143 // 00144 const char *chromosomeName = "5"; 00145 chr = reference.getChromosome(chromosomeName); // 0-based 00146 chrIndex = 100000; // 1-based 00147 00148 index = reference.getChromosomeStart(chr) + chrIndex - 1; 00149 00150 std::cout << "Chromosome '" << chromosomeName << "' position " << chrIndex << " corresponds to genome index position " << index << std::endl; 00151 00152 reference.close(); 00153 } 00154 00155 void testGenomeSequence(void) 00156 { 00157 GenomeSequence reference; 00158 00159 #if 0 00160 std::string referenceName = "someotherreference"; 00161 if (reference.setFastaName(referenceName)) { 00162 std::cerr << "failed to open reference file " 00163 << referenceName 00164 << std::endl; 00165 exit(1); 00166 } 00167 #endif 00168 00169 std::cerr << "open and prefetch the reference genome: "; 00170 00171 // open it 00172 if (reference.open()) { 00173 exit(1); 00174 } 00175 std::cerr << "done!" << std::endl; 00176 00177 // 00178 // For the human genome, genomeIndex ranges from 0 to 3.2x10^9 00179 // 00180 genomeIndex_t genomeIndex; // 0 based 00181 unsigned int chromosomeIndex; // 1 based 00182 unsigned int chromosome; // 0..23 or so 00183 std::string chromosomeName; 00184 00185 // 00186 // Here we'll start with a chromosome name, then obtain the genome 00187 // index, and use it to find the base we want: 00188 // 00189 chromosomeName = "2"; 00190 chromosomeIndex = 1234567; 00191 // this call is slow (string search for chromsomeName): 00192 genomeIndex = reference.getGenomePosition(chromosomeName.c_str(), chromosomeIndex); 00193 assert(genomeIndex!=INVALID_GENOME_INDEX); 00194 std::cout << "Chromosome " << chromosomeName << ", index "; 00195 std::cout << chromosomeIndex << " contains base " << reference[genomeIndex]; 00196 std::cout << " at genome index position " << genomeIndex << std::endl; 00197 00198 // 00199 // now reverse it - given a genomeIndex from above, find the chromosome 00200 // name and index: 00201 // 00202 00203 // slow (binary search on genomeIndex): 00204 chromosome = reference.getChromosome(genomeIndex); 00205 unsigned int newChromosomeIndex; 00206 // not slow: 00207 newChromosomeIndex = genomeIndex - reference.getChromosomeStart(chromosome) + 1; 00208 00209 assert(chromosomeIndex == newChromosomeIndex); 00210 00211 // more testing... at least test and use PackedRead: 00212 // 00213 PackedRead pr; 00214 00215 pr.set("ATCGATCG", 0); 00216 assert(pr.size()==8); 00217 assert(pr[0]==GenomeSequence::base2int[(int) 'A']); 00218 assert(pr[1]==GenomeSequence::base2int[(int) 'T']); 00219 assert(pr[2]==GenomeSequence::base2int[(int) 'C']); 00220 assert(pr[3]==GenomeSequence::base2int[(int) 'G']); 00221 pr.set("ATCGATCG", 1); 00222 assert(pr.size()==9); 00223 pr.set("", 0); 00224 assert(pr.size()==0); 00225 pr.set("", 1); 00226 assert(pr.size()==1); 00227 pr.set("", 2); 00228 assert(pr.size()==2); 00229 pr.set("", 3); 00230 assert(pr.size()==3); 00231 assert(pr[0]==GenomeSequence::base2int[(int) 'N']); 00232 assert(pr[1]==GenomeSequence::base2int[(int) 'N']); 00233 assert(pr[2]==GenomeSequence::base2int[(int) 'N']); 00234 pr.set("C", 1); 00235 assert(pr.size()==2); 00236 assert(pr[0]==GenomeSequence::base2int[(int) 'N']); 00237 assert(pr[1]==GenomeSequence::base2int[(int) 'C']); 00238 00239 } 00240 00241 #endif