Commit 7067a1c8 authored by Mike Iovine's avatar Mike Iovine
Browse files

Implement better Huffman code representation in inflate

parent 4bdde247
No related merge requests found
Showing with 170 additions and 136 deletions
+170 -136
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>
#include <math.h>
/* Types of huffman codes (values of the 2-bit BTYPE field read by read_block).
 * Each constant is defined once, as a macro; a variable with the same name
 * would be hidden by the macro for the rest of the file.
 */
#define FIXED 1
#define DYNAMIC 2
/* Maximum length for any Huffman code */
#define MAX_LENGTH 15
/* Dummy value for min_codes in huffman_t structs.
 * Parenthesized so the expansion is a single operand in any expression.
 */
#define NO_CODE (-1)
/* A Huffman code table whose symbols are grouped by code length.
 * All three arrays are indexed by the code length in bits (0..MAX_LENGTH).
 */
typedef struct huffman {
/* bl_counts[i] = number of codes of length i.
 * This is also the number of entries in alphabet[i].
 */
int bl_counts[MAX_LENGTH + 1];
/* This stores the alphabet, split by code length.
 * alphabet[i] points to an array of bl_counts[i] alphabet symbols, namely
 * the symbols whose codes are i bits long.
 * read_chunk treats a NULL entry as "no codes of this length".
 */
int *alphabet[MAX_LENGTH + 1];
/* Suppose we are reading a code and we want an index into the alphabet array.
 * These are the numerical offsets for such an index:
 * min_codes[i] is the code value that maps to alphabet[i][0].
 * So if the code we are reading has value c and is of length i,
 * its alphabet character is alphabet[i][c - min_codes[i]].
 * Lengths without codes are meant to hold NO_CODE.
 */
int min_codes[MAX_LENGTH + 1];
} huffman_t;
/*
* Constants for FIXED code type
*/
/* Alphabet for codes of length 7, 8, and 9, in order
* Not really intended for use; use HUFFMAN_FIXED instead.
*/
/* The 24 symbols 256..279 in ascending order.
 * HUFFMAN_FIXED lists this array as its alphabet for code length 7.
 * NOTE(review): identifiers that start with an underscore followed by an
 * uppercase letter are reserved in C -- consider renaming (together with
 * every user) in a follow-up.
 */
int _FIXED_7[24] = {256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268,
269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279};
/* 152 symbols: 0..143 followed by 280..287.
 * HUFFMAN_FIXED lists this array as its alphabet for code length 8, so both
 * runs share one index space (entries 0..143, then 144..151).
 * NOTE(review): the name is a reserved identifier (underscore + uppercase).
 */
int _FIXED_8[152] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86,
87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,
101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133,
134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 280,
281, 282, 283, 284, 285, 286, 287};
/* The 112 symbols 144..255 in ascending order.
 * HUFFMAN_FIXED lists this array as its alphabet for code length 9.
 * NOTE(review): the name is a reserved identifier (underscore + uppercase).
 */
int _FIXED_9[112] = {144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154,
155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165,
166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176,
177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187,
188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198,
199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,
221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231,
232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242,
243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253,
254, 255
};
/* Fixed Huffman codes mapping. */
/* Note that we only have codes of length 7, 8, 9 */
huffman_t HUFFMAN_FIXED = {
.bl_counts = {0, 0, 0, 0, 0, 0, 0, 24, 152, 112, 0, 0, 0, 0, 0},
static const int DECODE_7_MIN = 0;
static const int DECODE_7_MAX = 23;
static const int DECODE_7[] = {
256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270,
271, 272, 273, 274, 275, 276, 277, 278, 279
};
.alphabet = {NULL, NULL, NULL, NULL, NULL, NULL, NULL,
_FIXED_7, _FIXED_8, _FIXED_9,
NULL, NULL, NULL, NULL, NULL},
static const int DECODE_8_MIN = 48;
static const int DECODE_8_MAX = 191;
static const int DECODE_8[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73,
74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91,
92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135,
136, 137, 138, 139, 140, 141, 142, 143, 280, 281, 282, 283, 284, 285,
286, 287
};
static const int DECODE_9_MIN = 400;
static const int DECODE_9_MAX = 511;
static const int DECODE_9[] = {
144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158,
159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173,
174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188,
189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203,
204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218,
219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233,
234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248,
249, 250, 251, 252, 253, 254, 255
.min_codes = {NO_CODE, NO_CODE, NO_CODE, NO_CODE, NO_CODE, NO_CODE, NO_CODE,
0, 48, 192, 400,
NO_CODE, NO_CODE, NO_CODE, NO_CODE, NO_CODE}
};
/* Conversion table for lengths.
 * Length symbols start at 257; to index into the table, use
 * (value_read - LENGTH_OFFSET).
 * Each entry is the base length for that symbol. Additional bits may have to
 * be read and added to it; check LEN_ADDITIONAL for how many.
 */
#define LENGTH_OFFSET 257
int LEN_TABLE[29] = {
  3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51,
  59, 67, 83, 99, 115, 131, 163, 195, 227, 258
};
/* Number of additional bits to read after a length symbol.
 * Indexed like LEN_TABLE, i.e. by (value_read - 257); the value formed by
 * those bits is added to the base length from LEN_TABLE.
 */
int LEN_ADDITIONAL[29] = {
  0, 0, 0, 0, 0, 0, 0, 0,
  1, 1, 1, 1, 2, 2, 2, 2,
  3, 3, 3, 3, 4, 4, 4, 4,
  5, 5, 5, 5, 0
};
/* Conversion table for distance codes.
 * This can be indexed into directly with the distance code (0..29).
 * Each entry is the base distance; additional bits may have to be read and
 * added to it, see DIST_ADDITIONAL.
 */
int DIST_TABLE[30] = {
  1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 513,
  769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577
};
/* Number of additional bits to read after a distance code.
 * Indexed directly with the distance code (0..29); the value formed by those
 * bits is added to the base distance from DIST_TABLE.
 */
int DIST_ADDITIONAL[30] = {
  0, 0, 0, 0, 1, 1, 2, 2,
  3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
  9, 9, 10, 10, 11, 11, 12, 12, 13, 13
};
/* Keep track of the bit we are at in the input buffer.
 * This is file-scope state shared by the bit readers: get_next_bit advances
 * it by one for every bit consumed, and read_block moves it forward when it
 * skips padding. Do not modify it anywhere else.
 * NOTE(review): the name is a reserved identifier (underscore + uppercase);
 * renaming has to be done together with every user.
 */
int _CUR_BIT = 0;

/* Look up a code in a decode table that covers one contiguous code range.
 *
 * chunk:        code value to look up
 * decode_table: symbols for the codes decode_min..decode_max, in order
 * decode_min:   smallest code covered by decode_table
 * decode_max:   largest code covered by decode_table
 *
 * Returns decode_table[chunk - decode_min], or -1 if chunk lies outside
 * [decode_min, decode_max].
 */
int match(int chunk, const int decode_table[],
          const int decode_min, const int decode_max) {
  if (chunk < decode_min || chunk > decode_max) {
    return -1;
  }
  return decode_table[chunk - decode_min];
}
/* Return the next unread bit of buf and advance the global bit cursor.
 *
 * The bit is located through _CUR_BIT: byte _CUR_BIT / 8, and within that
 * byte the bits are consumed starting from the least significant one.
 * _CUR_BIT is incremented by one on every call.
 *
 * buf: input buffer. No bounds check is done here; the caller has to make
 *      sure the cursor stays inside the buffer.
 *
 * Returns 0 or 1.
 */
int get_next_bit(char *buf) {
  int byte_idx = _CUR_BIT / 8;
  int bit_idx = _CUR_BIT % 8;
  /* Work on an unsigned byte: building the mask 1 << 7 in a plain (possibly
   * signed) char relies on an implementation-defined conversion. */
  unsigned char byte = (unsigned char)buf[byte_idx];
  _CUR_BIT += 1;
  return (byte >> bit_idx) & 1;
}
/* Read the next n bits of buf (via get_next_bit) and assemble them into an
 * integer.
 *
 * buf:     input buffer, read at the global bit cursor
 * n:       number of bits to read; 0 reads nothing and yields 0
 * reverse: false -> the first bit read becomes the MOST significant bit of
 *                   the result
 *          true  -> the first bit read becomes the LEAST significant bit of
 *                   the result
 *
 * Returns the assembled value.
 */
int get_n_bits(char *buf, int n, bool reverse) {
  int res = 0;
  for (int i = 0; i < n; i++) {
    /* Each bit is read exactly once and lands in its own position. */
    int shift = reverse ? i : (n - i - 1);
    res |= get_next_bit(buf) << shift;
  }
  return res;
}
/* Read the block starting at the n-th bit */
int read_block(char *buf, int buf_size, int n, FILE *out) {
/* Keep track of the bit we are at in the buffer */
int pos = n;
/* Decode the next Huffman symbol from buf using the code table hf.
 *
 * Bits are pulled one at a time with get_next_bit; each new bit is appended
 * at the low end of the running code. After every bit, if hf has codes of
 * the current length, the code is converted into an index into the alphabet
 * for that length and returned when the index is valid.
 *
 * buf: input buffer, read at the global bit cursor
 * hf:  code table (see huffman_t)
 *
 * Returns the decoded symbol, or -1 if no code of at most MAX_LENGTH bits
 * matched. The bits examined are consumed in either case.
 */
int read_chunk(char *buf, huffman_t hf) {
  int code = 0;
  /* There should be no codes of length 0, so we can start at 1 */
  for (int len = 1; len <= MAX_LENGTH; len++) {
    /* We haven't matched up to this point; shift left, and read another bit */
    code = (code << 1) | get_next_bit(buf);
    /* Do we have codes of this length? */
    if (hf.alphabet[len] == NULL) {
      continue;
    }
    /* Index into hf.alphabet[len], i.e. values for codes of this length */
    int idx = code - hf.min_codes[len];
    /* Check both bounds: with an inconsistent table the code can be smaller
     * than min_codes[len], and a negative index would read before the
     * start of the alphabet array. */
    if (idx >= 0 && idx < hf.bl_counts[len]) {
      return hf.alphabet[len][idx];
    }
  }
  /* Something has gone wrong if we reach here */
  return -1;
}
/* Decode one block of buf, starting at the current bit position (_CUR_BIT),
 * and append the decoded bytes to out.
 *
 * buf: compressed input. It is consumed through get_next_bit, get_n_bits and
 *      read_chunk, all of which advance _CUR_BIT.
 * out: output stream. Back-references are resolved by seeking backwards in
 *      this stream and reading from it, so it must be open for both reading
 *      and writing -- NOTE(review): confirm the mode used by the caller.
 *
 * Only FIXED blocks are decoded. A DYNAMIC or unknown block type, an
 * undecodable code, an out-of-range length or distance symbol, or a
 * back-reference that reaches before the start of the output prints an
 * error to stderr and exits with status 1.
 */
void read_block(char *buf, FILE *out) {
  enum {
    END_OF_BLOCK = 256,
    LAST_LENGTH_SYMBOL = 285, /* last symbol with an entry in LEN_TABLE */
    LAST_DIST_CODE = 29       /* last code with an entry in DIST_TABLE */
  };

  /* First bit is the BFINAL flag */
  bool bfinal = get_next_bit(buf);
  /* Next two bits are BTYPE */
  int btype = get_n_bits(buf, 2, true);
  /* By default, use fixed mapping */
  huffman_t hf = HUFFMAN_FIXED;
  if (btype == DYNAMIC) {
    /* TODO update hf if dynamic.
     * Until then, refuse the block: decoding it with the fixed table would
     * silently produce garbage. */
    fprintf(stderr, "error: dynamic huffman blocks are not supported yet\n");
    exit(1);
  } else if (btype != FIXED) {
    fprintf(stderr, "error: unrecognized btype\n");
    exit(1);
  }

  /* Ready to decode when we reach this point. */
  int chunk_val;
  do {
    /* Match a huffman code */
    chunk_val = read_chunk(buf, hf);
    if (chunk_val < 0) {
      /* read_chunk found no code; do not mistake -1 for a literal */
      fprintf(stderr, "error: invalid huffman code\n");
      exit(1);
    }

    if (chunk_val < END_OF_BLOCK) {
      /* Literal, just write it to the output. fputc writes the value as one
       * byte regardless of the byte order of int. */
      fputc(chunk_val, out);
    } else if (chunk_val > END_OF_BLOCK) {
      /* We read a length */
      if (chunk_val > LAST_LENGTH_SYMBOL) {
        fprintf(stderr, "error: invalid length symbol\n");
        exit(1);
      }
      int length = LEN_TABLE[chunk_val - LENGTH_OFFSET];
      int addit_len = LEN_ADDITIONAL[chunk_val - LENGTH_OFFSET];
      length += get_n_bits(buf, addit_len, true);

      /* Next 5 bits are the distance code */
      int dist_code = get_n_bits(buf, 5, false);
      if (dist_code > LAST_DIST_CODE) {
        fprintf(stderr, "error: invalid distance code\n");
        exit(1);
      }
      int dist = DIST_TABLE[dist_code];
      int addit_dist = DIST_ADDITIONAL[dist_code];
      dist += get_n_bits(buf, addit_dist, true);

      /* Copy byte by byte: the source may overlap the bytes being written,
       * so each byte has to be appended before the next one is fetched. */
      for (int i = 0; i < length; i++) {
        if (fseek(out, -(long)dist, SEEK_CUR) != 0) {
          fprintf(stderr, "error: distance reaches before start of output\n");
          exit(1);
        }
        int val = fgetc(out);
        if (val == EOF) {
          fprintf(stderr, "error: could not read back output\n");
          exit(1);
        }
        /* Reposition before switching from reading back to writing */
        fseek(out, 0, SEEK_END);
        fputc(val, out);
      }
    }
  } while (chunk_val != END_OF_BLOCK);

  /* Skip over filler at the end of the stream. Compressed blocks follow
   * each other without padding, so only the final block is followed by
   * bits that round the stream up to a whole byte. */
  if (bfinal) {
    while (_CUR_BIT % 8 != 0) {
      _CUR_BIT += 1;
    }
  }
}
void inflate(FILE *fp) {
......@@ -198,11 +235,8 @@ void inflate(FILE *fp) {
}
fread(buf, 1, size, fp);
long n_read = 0;
while (n_read < 8 * size) {
n_read += read_block(buf, size, n_read, out);
break;
while (_CUR_BIT < 8 * size) {
read_block(buf, out);
}
fclose(out);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment