BaseAsciiMap.h

00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef _BASE_ASCII_MAP_H
00019 #define _BASE_ASCII_MAP_H
00020 
00021 #include "StringBasics.h"
00022 
00023 /// Map between characters and the associated base type.
00024 class BaseAsciiMap
00025 {
00026 public:
00027     /// Value associated with 'A' in the ascii to base map.
00028     static const int baseAIndex = 000;
00029     /// Value associated with 'T' in the ascii to base map.
00030     static const int baseTIndex = 001;
00031     /// Value associated with 'C' in the ascii to base map.
00032     static const int baseCIndex = 002;
00033     /// Value associated with 'G' in the ascii to base map.
00034     static const int baseGIndex = 003;
00035     /// Value associated with 'N' in the ascii to base map (bad read).
00036     static const int baseNIndex = 004;
00037     /// Value associated with any non-base character in the ascii to base
00038     /// map (unknown, bad data).
00039     static const int baseXIndex = 005;
00040 
00041     // Two arrays for converting between a base pair character (ASCII) and a
00042     // base integer in 0..3, plus 4 for 'N' (bad read) and 5 for non-bases.
00043     // NOTE(review): the constants above give T=1, C=2, G=3 while the map docs
00044     // below give C=1, G=2, T=3 -- confirm against the table definitions.
00045     /// Convert from int representation to the base.
00046     static const char int2base[];
00047     /// Convert from int representation to colorspace representation.
00048     static const char int2colorSpace[];
00049     static unsigned char base2complement[]; ///< Presumably base -> complement; TODO confirm.
00050 
00051     /// The type of space (color or base) to use in the mapping.
00052     enum SPACE_TYPE {
00053         /// Base decision on the first raw seq character/type has yet
00054         /// to be determined.
00055         UNKNOWN,
00056         BASE_SPACE, ///< Bases only (A,C,G,T,N).
00057         COLOR_SPACE ///< Color space only (0,1,2,3,.).
00058     };
00059 
00060     /// Map ASCII values to a 2 (or 3) bit encoding for the base pair value for
00061     /// both base and color space.
00062     /// 'A'/'a'/'0' -> 0; 'C'/'c'/'1' -> 1; 'G'/'g'/'2' -> 2; 'T'/'t'/'3' -> 3;
00063     /// 'N'/'n'/'4' -> 4; anything else -> 5.
00064     static unsigned char baseColor2int[256+1];   // base or color space read (ACGT/0123)
00065     /// Map ASCII values to a 2 (or 3) bit encoding for the base pair value for
00066     /// just base space (ACTGNactgn).
00067     /// 'A'/'a' -> 0;  'C'/'c' -> 1;  'G'/'g' -> 2;  'T'/'t' -> 3;
00068     /// 'N'/'n' -> 4; anything else -> 5.
00069     static unsigned char base2int[256+1];        // base space read (ACGT)
00070     /// Map ASCII values to a 2 (or 3) bit encoding for the base pair value for
00071     /// just color space (0123).
00072     /// '0' -> 0; '1' -> 1; '2' -> 2; '3' -> 3; '4' -> 4; anything else -> 5.
00073     static unsigned char color2int[256+1];       // color space read (0123)
00074 
00075 public:
00076     BaseAsciiMap();
00077     ~BaseAsciiMap();
00078 
00079     /// Set the base type based on the passed in option; resets the primer count.
00080     inline void setBaseMapType(SPACE_TYPE spaceType)
00081     {
00082         resetPrimerCount();
00083         // Select the lookup table that matches the requested space type.
00084         switch (spaceType)
00085         {
00086             case BASE_SPACE:
00087                 // base space.
00088                 myBase2IntMapPtr = base2int;
00089                 break;
00090             case COLOR_SPACE:
00091                 // color space.
00092                 myBase2IntMapPtr = color2int;
00093                 break;
00094             default:
00095                 // Unknown map type, zero the pointer.
00096                 myBase2IntMapPtr = NULL;
00097                 break;
00098         }
00099     }
00100 
00101     /// Returns the baseIndex value for the character passed in.
00102     inline int getBaseIndex(const char& letter)
00103     {
00104         // Index through unsigned char: plain char may be signed, and a
00105         // negative value would read before the start of the lookup tables.
00106         const int index = static_cast<unsigned char>(letter);
00107         if (myBase2IntMapPtr == NULL)
00108         {
00109             // Still expecting primer bases, so look the letter up in the
00110             // base map.
00111             if (myPrimerCount < myNumPrimerBases)
00112             {
00113                 ++myPrimerCount;
00114                 return(base2int[index]);
00115             }
00116 
00117             // All primers have been processed, so determine whether this
00118             // is base or color space from the letter itself.
00119             setBaseMapType(letter);
00120 
00121             // If it is still null, return invalid.  Will be set when the first
00122             // letter is either color or base.
00123             if (myBase2IntMapPtr == NULL)
00124             {
00125                 return(baseXIndex);
00126             }
00127         }
00128 
00129         // In color space the leading primer bases are still read as bases.
00130         if ((myBase2IntMapPtr == color2int) && (myPrimerCount < myNumPrimerBases))
00131         {
00132             // Still expecting primer bases, so lookup
00133             // the letter in the base map.
00134             ++myPrimerCount;
00135             return(base2int[index]);
00136         }
00137 
00138         return myBase2IntMapPtr[index];
00139     }
00140 
00141     /// Return the space type that is currently set.
00142     inline SPACE_TYPE getSpaceType()
00143     {
00144         if (myBase2IntMapPtr == base2int)
00145         {
00146             return(BASE_SPACE);
00147         }
00148         else if (myBase2IntMapPtr == color2int)
00149         {
00150             return(COLOR_SPACE);
00151         }
00152         else
00153         {
00154             return(UNKNOWN);
00155         }
00156     }
00157 
00158     /// Set the number of primer bases expected before the actual
00159     /// base/color space type occurs for the rest of the entries.
00160     void setNumPrimerBases(int numPrimerBases)
00161     {
00162         myNumPrimerBases = numPrimerBases;
00163     }
00164 
00165     /// Reset the number of primers to 0.
00166     void resetPrimerCount()
00167     {
00168         myPrimerCount = 0;
00169     }
00170 
00171     /// Reset the base mapping type to UNKNOWN.
00172     void resetBaseMapType()
00173     {
00174         myBase2IntMapPtr = NULL;
00175         resetPrimerCount();
00176     }
00177 
00178 private:
00179     // Set the base type based on the passed in letter.
00180     // If the letter is in neither the color space nor the base space, the
00181     // map type stays undetermined (NULL pointer).
00182     inline void setBaseMapType(const char& letter)
00183     {
00184         // Check base space first; index as unsigned char (char may be signed).
00185         if (base2int[static_cast<unsigned char>(letter)] != baseXIndex)
00186         {
00187             // This is a valid base space index, so it is base space.
00188             myBase2IntMapPtr = base2int;
00189         }
00190         else if (color2int[static_cast<unsigned char>(letter)] != baseXIndex)
00191         {
00192             // This is a valid color space index, so it is color space.
00193             myBase2IntMapPtr = color2int;
00194         }
00195         else
00196         {
00197             // Unknown map type, zero the pointer.
00198             myBase2IntMapPtr = NULL;
00199         }
00200     }
00201 
00202 
00203     // The number of primer bases to expect for a color-space file.
00204     unsigned int myNumPrimerBases;
00205 
00206     // This is the number of primer bases that have been seen since
00207     // the map type was set/reset.
00208     unsigned int myPrimerCount;
00209 
00210     unsigned char* myBase2IntMapPtr; // Active lookup table; NULL while undetermined.
00211 };
00212 
00213 #endif
Generated on Tue Sep 6 17:52:00 2011 for libStatGen Software by  doxygen 1.6.3