libStatGen Software  1
MiniDeflate.h
00001 /*
00002  *  Copyright (C) 2010  Regents of the University of Michigan
00003  *
00004  *   This program is free software: you can redistribute it and/or modify
00005  *   it under the terms of the GNU General Public License as published by
00006  *   the Free Software Foundation, either version 3 of the License, or
00007  *   (at your option) any later version.
00008  *
00009  *   This program is distributed in the hope that it will be useful,
00010  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *   GNU General Public License for more details.
00013  *
00014  *   You should have received a copy of the GNU General Public License
00015  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef __MINIDEFLATE_H__
00019 #define __MINIDEFLATE_H__
00020 
00021 #include <stdio.h>
00022 
00023 // MiniDeflate reads and writes files in a simple Deflate like format
00024 // A quick overview of this format follows, at the bottom of this file
00025 //
00026 
00027 // Performance tuning constants
00028 //
00029 
00030 // Hash table size is HASH_SIZE (a prime)
00031 #define HASH_SIZE    4093
00032 // Hash table depth is HASH_DEPTH (a power of 2)
00033 #define HASH_DEPTH   8
00034 // Matches that are not at least OKAY_MATCH chars are added to hash table
00035 #define OKAY_MATCH   32
00036 // Buffer size for FILE I/O
00037 #define BUFFER_SIZE  (32 * 1024)
00038 
00039 class MiniDeflate
00040 {
00041 public:
00042     MiniDeflate();
00043     ~MiniDeflate();
00044 
00045     void Deflate(FILE * output, void * input, size_t bytes);
00046     void Inflate(FILE * input, void * ouput, size_t bytes);
00047 
00048 private:
00049     unsigned char *  buffer;
00050     unsigned char *  hash_keys;
00051     unsigned char ** hash_values;
00052 
00053     // Inline functions used during file compression
00054     inline void EvaluateMatch(unsigned char * in, int len, int hash,
00055                               unsigned char * & best_pos, int & best_match);
00056     inline void QuoteLiterals(unsigned char * & in, int literal,
00057                               unsigned char * & out, int & buffer_len,
00058                               FILE * output);
00059     inline void OutputLiterals(unsigned char * & in, int literal,
00060                                unsigned char * & out, int & buffer_len,
00061                                FILE * output);
00062     inline void CiteLiteral(unsigned char * & out, int literal,
00063                             unsigned char * & in, int & buffer_len,
00064                             FILE * input);
00065 };
00066 
00067 // Format specification for deflate files
00068 //
00069 // A compressed file is a sequence of bytes {0 .. N}.
00070 // Each byte is a sequence of bits [0 .. 7] with 0 as the Most Significant Bit.
00071 //
00072 // The following tokens are recognized:
00073 //
00074 // Literal quotes -- refer to unique strings
00075 //
00076 //   BYTE0    BYTE1     BYTE2       Description
00077 //     0       HI        LO         Quote of 31 bytes of more
00078 //                                  Followed by (HI << 8 + LO + 31) quoted chars
00079 //    0:4|LEN                       Quote of up to 1-15 bytes
00080 //                                  Followed by LEN quoted chars
00081 //
00082 // String matches -- refer to previous strings in the input stream
00083 //
00084 //   BYTE0    BYTE1     BYTE2     BYTE3   BYTE4     Description
00085 //  1:4|OFF   OFF1     OFF2:2|0    HI      LO       Long match of > 66 bytes
00086 //                                                  Offset of OFF|OFF1|OFF2 + 1
00087 //                                                  Length of HI|LO + 66
00088 //  1:4|OFF   OFF1     OFF2:2|LEN                   Distant match of < 66 bytes
00089 //                                                  Offset of OFF|OFF1|OFF2 + 1
00090 //                                                  Length of LEN + 2
00091 //  LEN|OFF   OFF1                                  Nearby short match
00092 //                                                  Offset OFF|OFF1 + 1
00093 //                                                  Length LEN
00094 //
00095 
00096 // NOTE: When partitioning bytes, I use the notation X:n|Y so that
00097 // X takes the n MSB bits of byte and Y takes the remaining bits.
00098 
00099 
00100 #endif
00101 
00102 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends