libStatGen Software
1
|
00001 /* 00002 * Copyright (C) 2010 Regents of the University of Michigan 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef __MINIDEFLATE_H__ 00019 #define __MINIDEFLATE_H__ 00020 00021 #include <stdio.h> 00022 00023 // MiniDeflate reads and writes files in a simple Deflate like format 00024 // A quick overview of this format follows, at the bottom of this file 00025 // 00026 00027 // Performance tuning constants 00028 // 00029 00030 // Hash table size is HASH_SIZE (a prime) 00031 #define HASH_SIZE 4093 00032 // Hash table depth is HASH_DEPTH (a power of 2) 00033 #define HASH_DEPTH 8 00034 // Matches that are not at least OKAY_MATCH chars are added to hash table 00035 #define OKAY_MATCH 32 00036 // Buffer size for FILE I/O 00037 #define BUFFER_SIZE (32 * 1024) 00038 00039 class MiniDeflate 00040 { 00041 public: 00042 MiniDeflate(); 00043 ~MiniDeflate(); 00044 00045 void Deflate(FILE * output, void * input, size_t bytes); 00046 void Inflate(FILE * input, void * ouput, size_t bytes); 00047 00048 private: 00049 unsigned char * buffer; 00050 unsigned char * hash_keys; 00051 unsigned char ** hash_values; 00052 00053 // Inline functions used during file compression 00054 inline void EvaluateMatch(unsigned char * in, int len, int hash, 00055 unsigned char * & best_pos, int & best_match); 00056 inline void QuoteLiterals(unsigned char * & in, int literal, 00057 unsigned char * & out, int & buffer_len, 00058 FILE * output); 00059 inline void OutputLiterals(unsigned char * & in, int literal, 00060 unsigned char * & out, int & buffer_len, 00061 FILE * output); 00062 inline void CiteLiteral(unsigned char * & out, int literal, 00063 unsigned char * & in, int & buffer_len, 00064 FILE * input); 00065 }; 00066 00067 // Format specification for deflate files 00068 // 00069 // A compressed file is a sequence of bytes {0 .. N}. 00070 // Each byte is a sequence of bits [0 .. 7] with 0 as the Most Significant Bit. 00071 // 00072 // The following tokens are recognized: 00073 // 00074 // Literal quotes -- refer to unique strings 00075 // 00076 // BYTE0 BYTE1 BYTE2 Description 00077 // 0 HI LO Quote of 31 bytes of more 00078 // Followed by (HI << 8 + LO + 31) quoted chars 00079 // 0:4|LEN Quote of up to 1-15 bytes 00080 // Followed by LEN quoted chars 00081 // 00082 // String matches -- refer to previous strings in the input stream 00083 // 00084 // BYTE0 BYTE1 BYTE2 BYTE3 BYTE4 Description 00085 // 1:4|OFF OFF1 OFF2:2|0 HI LO Long match of > 66 bytes 00086 // Offset of OFF|OFF1|OFF2 + 1 00087 // Length of HI|LO + 66 00088 // 1:4|OFF OFF1 OFF2:2|LEN Distant match of < 66 bytes 00089 // Offset of OFF|OFF1|OFF2 + 1 00090 // Length of LEN + 2 00091 // LEN|OFF OFF1 Nearby short match 00092 // Offset OFF|OFF1 + 1 00093 // Length LEN 00094 // 00095 00096 // NOTE: When partitioning bytes, I use the notation X:n|Y so that 00097 // X takes the n MSB bits of byte and Y takes the remaining bits. 00098 00099 00100 #endif 00101 00102