libStatGen Software  1
BaseAsciiMap.h
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef _BASE_ASCII_MAP_H
00019 #define _BASE_ASCII_MAP_H
00020 
00021 #include "StringBasics.h"
00022 
00023 /// Map between characters and the associated base type.
class BaseAsciiMap
{
public:
    /// Value associated with 'N' in the ascii to base map (bad read).
    static const int baseNIndex = 4;
    /// Value associated with any non-base character in the ascii to base
    /// map (unknown, bad data).
    static const int baseXIndex = 5;

    // Lookup tables for converting between a base pair character (ASCII)
    // and its integer code.  Codes 0..3 are the four bases/colors, code 4
    // (baseNIndex) is 'N' and code 5 (baseXIndex) is any other character.
    // The tables are only declared here; their contents are defined
    // outside this header.

    /// Convert from int representation to the base.
    static const char int2base[];
    /// Convert from int representation to colorspace representation.
    static const char int2colorSpace[];
    /// NOTE(review): presumably maps a base character to its complement;
    /// the contents are defined outside this header, so confirm before
    /// relying on that.
    static unsigned char base2complement[];

    /// The type of space (color or base) to use in the mapping.
    enum SPACE_TYPE {
        /// Base decision on the first raw seq character/type has yet 
        /// to be determined.
        UNKNOWN,
        BASE_SPACE, ///< Bases only (A,C,G,T,N).
        COLOR_SPACE ///< Color space only (0,1,2,3,.).
    };

    /// Map ASCII values to a 2 (or 3) bit encoding for the base pair value for
    /// both base and color space.
    /// 'A'/'a'/'0' -> 0; 'C'/'c'/'1' -> 1; 'G'/'g'/'2' -> 2; 'T'/'t'/'3' -> 3;
    /// 'N'/'n'/'4' -> 4; anything else -> 5.
    static unsigned char baseColor2int[256+1];   // base and color space
    /// Map ASCII values to a 2 (or 3) bit encoding for the base pair value for
    /// just base space (ACTGNactgn).
    /// 'A'/'a' -> 0;  'C'/'c' -> 1;  'G'/'g' -> 2;  'T'/'t' -> 3;
    /// 'N'/'n' -> 4; anything else -> 5.
    static unsigned char base2int[256+1];        // base space only
    /// Map ASCII values to a 2 (or 3) bit encoding for the base pair value for
    /// just color space (0123).
    /// '0' -> 0; '1' -> 1; '2' -> 2; '3' -> 3; '4' -> 4; anything else -> 5.
    static unsigned char color2int[256+1];       // color space only

public:
    BaseAsciiMap();
    ~BaseAsciiMap();

    /// Set the base type based on the passed in option.
    /// The primer count is reset as part of the change.  Any value other
    /// than BASE_SPACE or COLOR_SPACE leaves the map type unknown.
    inline void setBaseMapType(SPACE_TYPE spaceType)
    {
        resetPrimerCount();
        switch (spaceType)
        {
            case BASE_SPACE:
                // base space.
                myBase2IntMapPtr = base2int;
                break;
            case COLOR_SPACE:
                // color space.
                myBase2IntMapPtr = color2int;
                break;
            default:
                // Unknown map type, zero the pointer.
                myBase2IntMapPtr = NULL;
                break;
        }
    }

    /// Returns the baseIndex value for the character passed in.
    /// While primer bases are still expected (map type unknown or color
    /// space), the letter is counted as a primer and looked up in the base
    /// space table.  If the map type is unknown once the primers are
    /// consumed, it is determined from this letter; baseXIndex is returned
    /// when the letter is in neither base nor color space.
    inline int getBaseIndex(const char& letter)
    {
        if (myBase2IntMapPtr == NULL)
        {
            // Check to see if we have hit the number of primer bases.
            if (myPrimerCount < myNumPrimerBases)
            {
                // Still expecting primer bases, so lookup
                // the letter in the base map.
                ++myPrimerCount;
                return(base2int[tableIndex(letter)]);
            }

            // Have already processed all the primers, so determine
            // whether this is base or color space.
            setBaseMapType(letter);

            // If it is still null, return invalid.  Will be set when the first
            // letter is either color or base.
            if (myBase2IntMapPtr == NULL)
            {
                return(baseXIndex);
            }
        }

        // Also check if configured as color space that the primers are correct.
        if ((myBase2IntMapPtr == color2int) && (myPrimerCount < myNumPrimerBases))
        {
            // Still expecting primer bases, so lookup
            // the letter in the base map.
            ++myPrimerCount;
            return(base2int[tableIndex(letter)]);
        }

        return myBase2IntMapPtr[tableIndex(letter)];
    }

    /// Return the space type that is currently set.
    inline SPACE_TYPE getSpaceType()
    {
        if (myBase2IntMapPtr == base2int)
        {
            return(BASE_SPACE);
        }
        else if (myBase2IntMapPtr == color2int)
        {
            return(COLOR_SPACE);
        }
        else
        {
            return(UNKNOWN);
        }
    }

    /// Set the number of primer bases expected before the actual
    /// base/color space type occurs for the rest of the entries.
    void setNumPrimerBases(int numPrimerBases)
    {
        myNumPrimerBases = numPrimerBases;
    }

    /// Reset the number of primers to 0.
    void resetPrimerCount()
    {
        myPrimerCount = 0;
    }

    /// Reset the base mapping type to UNKNOWN.
    void resetBaseMapType()
    {
        myBase2IntMapPtr = NULL;
        resetPrimerCount();
    }

private:
    // Convert a character into a lookup table index in the range 0..255.
    // Plain char may be signed, so converting through unsigned char keeps
    // characters above 0x7F from turning into negative indexes that would
    // read before the start of a table.
    static inline int tableIndex(const char& letter)
    {
        return(static_cast<unsigned char>(letter));
    }

    // Set the base type based on the passed in letter.
    // If the letter is in neither the color space or the base space, both
    // will be allowed.
    inline void setBaseMapType(const char& letter)
    {
        const int index = tableIndex(letter);

        //First check to see if it is in base space.
        if (base2int[index] != baseXIndex)
        {
            // This is a valid base space index, so it is base space.
            myBase2IntMapPtr = base2int;
        }
        else if (color2int[index] != baseXIndex)
        {
            // This is a valid color space index, so it is color space.
            myBase2IntMapPtr = color2int;
        }
        else
        {
            // Unknown map type, zero the pointer.
            myBase2IntMapPtr = NULL;
        }
    }


    // The number of primer bases to expect for a color-space file.
    unsigned int myNumPrimerBases;

    // This is the number of primer bases that have been seen since
    // the map type was set/reset.
    unsigned int myPrimerCount;

    // Lookup table currently in use: base2int, color2int, or NULL while
    // the space type is still unknown.
    unsigned char* myBase2IntMapPtr;
};
00204 
00205 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends