libStatGen Software  1
MiniDeflate.cpp
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #include "MiniDeflate.h"
00019 
00020 // Convenient constants and macros
00021 //
00022 #define EMPTY_KEY    123
00023 #define uchar        unsigned char
00024 
00025 #ifndef min
00026 #define min(a,b)   (((a)<(b))?(a):(b))
00027 #endif
00028 
00029 MiniDeflate::MiniDeflate()
00030 {
00031     buffer = new uchar [BUFFER_SIZE + 5];
00032     hash_keys = new uchar [HASH_SIZE];
00033     hash_values = new uchar * [HASH_SIZE * HASH_DEPTH];
00034 }
00035 
00036 MiniDeflate::~MiniDeflate()
00037 {
00038     delete [] buffer;
00039     delete [] hash_keys;
00040     delete [] hash_values;
00041 }
00042 
00043 void MiniDeflate::EvaluateMatch(unsigned char * in, int len, int hash,
00044                                 unsigned char * & best_pos, int & best_match)
00045 {
00046     int max = min(len, 0xFFFF + 66);
00047 
00048     for (int i = HASH_DEPTH; i > 0; i--)
00049         // Check each possible match (up to HASH_DEPTH)
00050     {
00051         uchar * pos = hash_values[hash * HASH_DEPTH + ((hash_keys[hash] + i) % HASH_DEPTH)];
00052 
00053         if (pos == NULL || in - pos >= 0x4001) break;
00054 
00055         int match = 0;
00056 
00057         while (match < max && pos[match] == in[match])
00058             match++;
00059 
00060         if (match > best_match)
00061         {
00062             best_match = match;
00063             best_pos = pos;
00064         }
00065     }
00066 
00067     // If string seems pretty unique, add to hash table
00068     if (best_match < OKAY_MATCH)
00069     {
00070         int delta = hash_keys[hash] = (uchar)((hash_keys[hash] + 1) & 7);
00071         hash_values[hash * 8 + delta] = in;
00072     }
00073 }
00074 
00075 void MiniDeflate::QuoteLiterals(unsigned char * & in, int literal,
00076                                 unsigned char * & out, int & buffer_len,
00077                                 FILE * output)
00078 {
00079     if (buffer_len < 0)
00080     {
00081         fwrite(buffer, out - buffer, 1, output);
00082         buffer_len = BUFFER_SIZE;
00083         out = buffer;
00084     }
00085 
00086     while (buffer_len < literal)
00087     {
00088         literal -= buffer_len;
00089         while (buffer_len--)
00090         {
00091             *out = *in;
00092             in++;
00093             out++;
00094         }
00095         fwrite(buffer, BUFFER_SIZE, 1, output);
00096         buffer_len = BUFFER_SIZE;
00097         out = buffer;
00098     }
00099 
00100     while (literal--)
00101     {
00102         *out = *in;
00103         in++;
00104         out++;
00105         buffer_len--;
00106     }
00107 }
00108 
00109 void MiniDeflate::OutputLiterals(unsigned char * & in, int literal,
00110                                  unsigned char * & out, int & buffer_len,
00111                                  FILE * output)
00112 {
00113     while (literal > 0)
00114         if (literal < 16)
00115         {
00116             *out = (char) literal;
00117             out++;
00118             buffer_len--;
00119             QuoteLiterals(in, literal, out, buffer_len, output);
00120             break;
00121         }
00122         else if (literal < 31)
00123         {
00124             *out = 15;
00125             out++;
00126             buffer_len--;
00127             QuoteLiterals(in, 15, out, buffer_len, output);
00128             *out = (uchar)(literal - 15);
00129             out++;
00130             buffer_len--;
00131             QuoteLiterals(in, literal - 15, out, buffer_len, output);
00132             break;
00133         }
00134         else
00135         {
00136             int length = min(literal, 0xFFFF + 31);
00137             literal   -= length;
00138             length    -= 31;
00139 
00140             *out = 0;
00141             out++;
00142             *out = (uchar)(length >> 8);
00143             out++;
00144             *out = (uchar)(length & 0xFF);
00145             out++;
00146             buffer_len -= 3;
00147 
00148             QuoteLiterals(in, length + 31, out, buffer_len, output);
00149         }
00150 }
00151 
00152 
00153 void MiniDeflate::Deflate(FILE * output, void * void_input, size_t len)
00154 {
00155     uchar    * in  = (uchar *) void_input;
00156     uchar    * out = (uchar *) buffer;
00157     int buffer_len = BUFFER_SIZE;
00158 
00159     for (int i = 0; i < HASH_SIZE; i++) hash_keys[i] = EMPTY_KEY;
00160 
00161     uchar * in2 = in;
00162 
00163     while (len > 2)
00164     {
00165         // Hash the current input value
00166         int hash = ((in[0] << 16) | (in[1] << 8) | in[2]) % HASH_SIZE;
00167 
00168         if (hash_keys[hash] != EMPTY_KEY)
00169             // Possible matches in hash table
00170         {
00171             int best_match = 0;
00172             uchar * best_pos = NULL;
00173 
00174             EvaluateMatch(in, len, hash, best_pos, best_match);
00175 
00176             // If there are no decent matches
00177             if (best_match < 3)
00178             {
00179                 in++;
00180                 len--;
00181                 continue;
00182             }
00183 
00184             // Try look ahead if match isn't great
00185             while (best_match < OKAY_MATCH && len > 3)
00186             {
00187                 // Peek to see if we could get a better match
00188                 int next_hash = ((in[1] << 16) | (in[2] << 8) | in[3]) % HASH_SIZE;
00189 
00190                 if (hash_keys[next_hash] == EMPTY_KEY) break;
00191 
00192                 int next_match = 0;
00193                 uchar * next_pos = NULL;
00194 
00195                 EvaluateMatch(in + 1, len - 1, next_hash, next_pos, next_match);
00196 
00197                 // Didn't find a better match
00198                 if (next_match <= best_match + 1) break;
00199 
00200                 // Found a better match, so try again
00201                 in++;
00202                 len--;
00203                 best_match = next_match;
00204                 best_pos = next_pos;
00205             }
00206 
00207             int best_offset = in - best_pos - 1;
00208 
00209             // This is where we output stuff
00210             // Check if we have some literals to output first
00211             OutputLiterals(in2, in - in2, out, buffer_len, output);
00212 
00213             in2 = in += best_match;
00214             len -= best_match;
00215 
00216             if (best_match < 17 && best_offset < 0x1000)
00217             {
00218                 *out = (uchar)(((best_match - 1) << 4) | (best_offset >> 8));
00219                 out++;
00220                 *out = (uchar)(best_offset & 0xFF);
00221                 out++;
00222                 buffer_len -= 2;
00223             }
00224             else if (best_match < 66)
00225             {
00226                 *out = (uchar)(16 | (best_offset >> 10));
00227                 out++;
00228                 *out = (uchar)((best_offset >> 2) & 0xFF);
00229                 out++;
00230                 *out = (uchar)((best_offset << 6) | (best_match - 2));
00231                 out++;
00232                 buffer_len -= 3;
00233             }
00234             else
00235             {
00236                 *out = (uchar)(16 | (best_offset >> 10));
00237                 out++;
00238                 *out = (uchar)((best_offset >> 2) & 0xFF);
00239                 out++;
00240                 *out = (uchar)(best_offset << 6);
00241                 out++;
00242                 best_match -= 66;
00243                 *out = (uchar)(best_match >> 8);
00244                 out++;
00245                 *out = (uchar)(best_match & 0xFF);
00246                 out++;
00247                 buffer_len -= 5;
00248             }
00249 
00250             if (buffer_len <= 0)
00251             {
00252                 fwrite(buffer, out - buffer, 1, output);
00253                 buffer_len = BUFFER_SIZE;
00254                 out = buffer;
00255             }
00256         }
00257         // Never seen this sequence before
00258         else
00259         {
00260             hash_keys[hash] = 0;
00261             for (int i = 1; i < HASH_DEPTH; i++) hash_values[hash * 8 + i] = NULL;
00262             hash_values[hash * 8] = in;
00263             in++;
00264             len--;
00265         }
00266     }
00267 
00268     // Check if we have some trailing literals to output
00269     in += len;
00270     OutputLiterals(in2, in - in2, out, buffer_len, output);
00271 
00272     // Flush output
00273     if (out != buffer) fwrite(buffer, out - buffer, 1, output);
00274 }
00275 
00276 void MiniDeflate::CiteLiteral(unsigned char * & out, int literal,
00277                               unsigned char * & in, int & buffer_len,
00278                               FILE * input)
00279 {
00280     while (buffer_len < literal)
00281     {
00282         literal -= buffer_len;
00283         while (buffer_len--)
00284         {
00285             *out = *in;
00286             in++;
00287             out++;
00288         }
00289         buffer_len = fread(buffer + 5, 1, BUFFER_SIZE, input);
00290         in = buffer + 5;
00291     }
00292 
00293     while (literal--)
00294     {
00295         *out = *in;
00296         in++;
00297         out++;
00298         buffer_len--;
00299     }
00300 }
00301 
00302 
00303 void MiniDeflate::Inflate(FILE * input, void * void_output, size_t len)
00304 {
00305     uchar    * out = (uchar *) void_output;
00306     uchar    * in  = (uchar *) buffer + 5;
00307     int buffer_len = BUFFER_SIZE;
00308 
00309     buffer_len = fread(buffer + 5, 1, BUFFER_SIZE, input);
00310 
00311     while (len)
00312     {
00313         int match_len = *in >> 4;
00314 
00315         // Matching a literal
00316         if (match_len == 0)
00317         {
00318             match_len = *in & 0x0F;
00319             in++, buffer_len--;
00320 
00321             // If match_len == 0 then string is longer than 30 characters
00322             // Strings of 16 - 30 characters are encoded as two short strings
00323             if (match_len == 0)
00324             {
00325                 match_len = (in[0] << 8) + in[1] + 31;
00326                 in += 2;
00327                 buffer_len -= 2;
00328             }
00329 
00330             CiteLiteral(out, match_len, in, buffer_len, input);
00331             len -= match_len;
00332         }
00333         // Long match, 14 bit offset
00334         else if (match_len == 1)
00335         {
00336             int offset = (((in[0] & 0x0F) << 10) | (in[1] << 2) | (in[2] >> 6)) + 1;
00337             match_len  = (in[2] & 0x3F) + 2;
00338             in += 3;
00339             buffer_len -= 3;
00340 
00341             if (match_len == 2)
00342             {
00343                 match_len = ((in[0] << 8) | in[1]) + 66;
00344                 in  += 2;
00345                 buffer_len -= 2;
00346             }
00347 
00348             uchar * match_pos = out - offset;
00349             len -= match_len;
00350             while (match_len--)
00351             {
00352                 *out = *match_pos;
00353                 out++, match_pos++;
00354             }
00355         }
00356         // Typical short match
00357         else
00358         {
00359             int offset = (((in[0] & 0x0F) << 8) | in[1]) + 1;
00360             in += 2;
00361             buffer_len -= 2;
00362 
00363             uchar * match_pos = out - offset;
00364             len -= ++match_len;
00365             while (match_len--)
00366             {
00367                 *out = *match_pos;
00368                 out++, match_pos++;
00369             }
00370         }
00371 
00372         if (buffer_len < 5)
00373         {
00374             uchar * in2 = (uchar *) buffer + 5 - buffer_len;
00375             while (in2 != buffer + 5)
00376             {
00377                 *in2 = *in;
00378                 in2++;
00379                 in++;
00380             }
00381 
00382             in = buffer + 5 - buffer_len;
00383             buffer_len += fread(buffer + 5, 1, BUFFER_SIZE, input);
00384         }
00385     }
00386 
00387     if (buffer_len) fseek(input, -buffer_len, SEEK_CUR);
00388 }
00389 
00390 
00391 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends