Commit 89bf9c90 authored by Mike Iovine's avatar Mike Iovine
Browse files

Start tests, move some files around

parent 470866d7
No related merge requests found
Pipeline #30548 canceled with stage
Showing with 974 additions and 467 deletions
+974 -467
all : myzip0 myunzip0 huffman inflate
all : myzip0 myunzip0 huffman inflate test_inflate
myzip0 : myzip0.c
cc myzip0.c -o myzip0
......@@ -9,8 +9,12 @@ myunzip0 : myunzip0.c
huffman : huffman.c
cc huffman.c -o huffman
inflate : inflate.c
cc inflate.c -o inflate
inflate : include/inflate.h src/inflate/main.c src/inflate/inflate.c
gcc -I include src/inflate/main.c src/inflate/inflate.c -o inflate
test_inflate : src/inflate/inflate.c tests/inflate_test.cpp
g++ -I include src/inflate/inflate.c tests/inflate_test.cpp -o test_inflate -l gtest
./test_inflate
clean :
rm myzip0 myunzip0 huffman inflate
rm myzip0 myunzip0 huffman inflate test_inflate
This diff is collapsed.
......@@ -22,6 +22,9 @@ typedef struct huffman {
} huffman_t;
/* Reset the position state variable */
void reset_pos();
/* Create a Huffman mapping from a sequence of lengths. */
huffman_t *make_huffman(int *lens, int *alphabet, int n_symbols);
......
#include <stdio.h>
#include <assert.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>
#include "inflate.h"
/* Types of huffman codes */
#define FIXED 1
#define DYNAMIC 2
/* Dummy value for min_codes in huffman_t structs */
#define NO_CODE -1
/*
* Constants for FIXED code type
*/
/* Alphabet for codes of length 7, 8, and 9, in order
* Not really intended for use; use HUFFMAN_FIXED instead.
*/
int _FIXED_7[24] = {256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268,
269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279};
int _FIXED_8[152] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86,
87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,
101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133,
134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 280,
281, 282, 283, 284, 285, 286, 287};
int _FIXED_9[112] = {144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154,
155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165,
166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176,
177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187,
188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198,
199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,
221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231,
232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242,
243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253,
254, 255
};
/* Note that we only have codes of length 7, 8, 9 */
huffman_t HUFFMAN_FIXED = {
.bl_counts = {0, 0, 0, 0, 0, 0, 0, 24, 152, 112, 0, 0, 0, 0, 0},
.alphabet = {NULL, NULL, NULL, NULL, NULL, NULL, NULL,
_FIXED_7, _FIXED_8, _FIXED_9,
NULL, NULL, NULL, NULL, NULL},
.min_codes = {NO_CODE, NO_CODE, NO_CODE, NO_CODE, NO_CODE, NO_CODE, NO_CODE,
0, 48, 192, 400,
NO_CODE, NO_CODE, NO_CODE, NO_CODE, NO_CODE}
};
/* Number of characters in the code-length alphabet */
#define N_CL_ALPHABET 19
int CL_ALPHABET[N_CL_ALPHABET] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
};
/* Number of characters in the distance alphabet */
#define N_DISTS 30
int DIST_ALPHABET[N_DISTS] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
21, 22, 23, 24, 25, 26, 27, 28, 29
};
huffman_t HUFFMAN_FIXED_DISTS = {
.bl_counts = {0, 0, 0, 0, 0, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
.alphabet = {NULL, NULL, NULL, NULL, NULL, DIST_ALPHABET, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL},
.min_codes = {NO_CODE, NO_CODE, NO_CODE, NO_CODE, NO_CODE, 0,
NO_CODE, NO_CODE, NO_CODE, NO_CODE, NO_CODE, NO_CODE,
NO_CODE, NO_CODE, NO_CODE, NO_CODE}
};
/* Conversion tables for lengths
* To index into lengths, use (value_read - LENGTH_OFFSET)
* We may have to read additional bits; check LEN_ADDITIONAL for how many
*/
#define LENGTH_OFFSET 257
int LEN_TABLE[29] = {
3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51,
59, 67, 83, 99, 115, 131, 163, 195, 227, 258
};
int LEN_ADDITIONAL[29] = {
0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 2, 2, 2, 2,
3, 3, 3, 3, 4, 4, 4, 4,
5, 5, 5, 5, 0
};
/* Conversion tables for distance codes
* This can be indexed into directly with the distance code.
* Again, we may have to read additional bits to get the distance.
*/
int DIST_TABLE[30] = {
1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 513,
769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577
};
int DIST_ADDITIONAL[30] = {
0, 0, 0, 0, 1, 1, 2, 2,
3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
9, 9, 10, 10, 11, 11, 12, 12, 13, 13
};
/*
* Constants for DYNAMIC code type
*/
/* Number of characters in the literal-length alphabet */
#define N_LITERALS 286
int LITERAL_ALPHABET[N_LITERALS] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92,
93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123,
124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183,
184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198,
199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213,
214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228,
229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243,
244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258,
259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273,
274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285
};
/* After the dynamic block header, a sequence of lengths
* occurs; the lengths correspond to characters in the
* code-length alphabet in this order.
*/
int CL_ORDER[N_CL_ALPHABET] = {
16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
};
/* Keep track of the bit we are at.
* DO NOT MODIFY ANYWHERE EXCEPT get_next_bit */
int _CUR_BIT = 0;
int get_next_bit(char *buf) {
int buf_pos = _CUR_BIT / 8;
int pos = _CUR_BIT - 8 * buf_pos;
char byte = buf[buf_pos];
char mask = 1 << pos;
_CUR_BIT += 1;
return ((byte & mask) != 0);
}
int get_n_bits(char *buf, int n, bool reverse) {
int res = 0;
for (int i = 0; i < n; i++) {
if (!reverse) {
res += get_next_bit(buf) << (n - i - 1);
} else {
res += get_next_bit(buf) << i;
}
}
return res;
}
int read_chunk(char *buf, huffman_t hf) {
int code = 0;
/* There should be no codes of length 0, so we can start at 1 */
for (int i = 1; i < MAX_LENGTH + 1; i++) {
/* We haven't matched up to this point; shift left, and read another bit */
code <<= 1;
code += get_next_bit(buf);
/* Do we have codes of length i? */
if (hf.alphabet[i] != NULL) {
/* Index into hf.alphabet[i], i.e. values for codes of length i */
int idx = (code - hf.min_codes[i]);
/* If this is a valid index, then we have a match of length i */
if (idx < hf.bl_counts[i]) {
return hf.alphabet[i][idx];
}
}
}
/* Something has gone wrong if we reach here */
return -1;
}
huffman_t *make_huffman(int *lens, int *alphabet, int n_symbols) {
huffman_t *hf = (huffman_t *) malloc(sizeof(huffman_t));
/* Initialize fields */
for (int i = 0; i < MAX_LENGTH + 1; i++) {
hf->bl_counts[i] = 0;
hf->alphabet[i] = NULL;
hf->min_codes[i] = NO_CODE;
}
/* Update bl_counts with the lens */
for (int i = 0; i < n_symbols; i++) {
int len = lens[i];
if (len != 0) {
hf->bl_counts[len] += 1;
}
}
/*
* Construct min_codes, the value of the smallest code for each
* code length
*/
int code = 0;
for (int i = 1; i < MAX_LENGTH + 1; i++) {
code = (code + hf->bl_counts[i - 1]) << 1;
if (hf->bl_counts[i] != 0) hf->min_codes[i] = code;
}
/*
* Start assigning codes to alphabet characters.
*/
for (int len = 0; len < MAX_LENGTH + 1; len++) {
if (hf->bl_counts[len] != 0) {
int *symbols = (int *) malloc(hf->bl_counts[len] * sizeof(int));
int cur = 0;
for (int sym = 0; sym < n_symbols; sym++) {
if (lens[sym] == len) {
symbols[cur] = sym;
cur += 1;
}
}
hf->alphabet[len] = symbols;
}
}
return hf;
}
void destroy_huffman(huffman_t *hf) {
for (int i = 0; i < MAX_LENGTH + 1; i++) {
if (hf->alphabet[i] != NULL) free(hf->alphabet[i]);
}
free(hf);
}
int *read_lens(char *buf, huffman_t *hf_codes, int num_symbols, int num_codes) {
int *lens = (int *) malloc(num_symbols * sizeof(int));
for (int i = 0; i < num_symbols; i++) lens[i] = 0;
int cur_lit = 0;
while (cur_lit < num_codes) {
int len = read_chunk(buf, *hf_codes);
if (len <= 15) {
lens[cur_lit] = len;
cur_lit += 1;
} else if (len == 16) {
int rep = get_n_bits(buf, 2, true) + 3;
int prev = lens[cur_lit - 1];
for (int j = 0; j < rep; j++) {
lens[cur_lit] = prev;
cur_lit += 1;
}
} else if (len == 17) {
int rep = get_n_bits(buf, 3, true) + 3;
for (int j = 0; j < rep; j++) {
lens[cur_lit] = 0;
cur_lit += 1;
}
} else if (len == 18) {
int rep = get_n_bits(buf, 7, true) + 11;
for (int j = 0; j < rep; j++) {
lens[cur_lit] = 0;
cur_lit += 1;
}
} else {
fprintf(stderr, "error: invalid length");
exit(1);
}
}
return lens;
}
void read_block(char *buf, FILE *out) {
/* First bit is the BFINAL flag */
bool bfinal = get_next_bit(buf);
/* Next two bits are BTYPE */
int btype = get_n_bits(buf, 2, true);
/* By default, use fixed mapping */
huffman_t *hf = &HUFFMAN_FIXED;
huffman_t *hf_dist = &HUFFMAN_FIXED_DISTS;
if (btype == DYNAMIC) {
/* Read dynamic block header */
int hlit = get_n_bits(buf, 5, true);
int hdist = get_n_bits(buf, 5, true);
int hclen = get_n_bits(buf, 4, true);
/* Read in the code lengths for the code-length alphabet */
int *lens_cl = (int *) malloc(N_CL_ALPHABET * sizeof(int));
for (int i = 0; i < N_CL_ALPHABET; i++) lens_cl[i] = 0;
for (int i = 0; i < hclen + 4; i++) {
int len = get_n_bits(buf, 3, true);
int cl = CL_ORDER[i];
lens_cl[cl] = len;
}
huffman_t *hf_codes = make_huffman(lens_cl, CL_ALPHABET, N_CL_ALPHABET);
/* Now read in the code lengths for the literal alphabet */
int *lens_lit = read_lens(buf, hf_codes, N_LITERALS, hlit + 257);
/* Huffman mapping for the length-literal alphabet */
huffman_t *hf_lit = make_huffman(lens_lit, LITERAL_ALPHABET, N_LITERALS);
/* Read in the lengths for the distance alphabet */
int *lens_dist = read_lens(buf, hf_codes, N_DISTS, hdist + 1);
/* Huffman mapping for the distance code alphabet */
huffman_t *hf_dist_new = make_huffman(lens_dist, DIST_ALPHABET, N_DISTS);
/* Cleanup */
destroy_huffman(hf_codes);
free(lens_cl);
free(lens_lit);
free(lens_dist);
hf = hf_lit;
hf_dist = hf_dist_new;
} else if (btype != FIXED) {
fprintf(stderr, "error: unrecognized btype\n");
exit(1);
}
/* Read to decode when we reach this point. */
int chunk_val;
do
{
/* Match a huffman code */
chunk_val = read_chunk(buf, *hf);
/* Literal, just write to output buffer */
if (chunk_val < 256) {
fwrite(&chunk_val, 1, sizeof(char), out);
/* We read a length */
} else if (chunk_val > 256) {
int length = LEN_TABLE[chunk_val - LENGTH_OFFSET];
int addit_len = LEN_ADDITIONAL[chunk_val - LENGTH_OFFSET];
length += get_n_bits(buf, addit_len, true);
/* Read in the distance code */
int dist_code = read_chunk(buf, *hf_dist);
assert(dist_code != -1);
int dist = DIST_TABLE[dist_code];
int addit_dist = DIST_ADDITIONAL[dist_code];
dist += get_n_bits(buf, addit_dist, true);
for (int i = 0; i < length; i++) {
fseek(out, -dist, SEEK_CUR);
int val = fgetc(out);
fseek(out, 0, SEEK_END);
fwrite(&val, 1, sizeof(char), out);
}
}
}
while (chunk_val != 256);
/* Skip over filler at the end of the block */
while (_CUR_BIT % 8 != 0) {
_CUR_BIT += 1;
}
if (btype == DYNAMIC) {
destroy_huffman(hf);
destroy_huffman(hf_dist);
}
}
void truncate_suffix(char *fname) {
char *dot = strrchr(fname, '.');
char *suffix = ".deflate";
for (int i = 0; i < 8; i++) {
if (dot == NULL || *(dot + i) != suffix[i]) {
fprintf(stderr, "error: must be a .deflate file");
exit(1);
}
}
*dot = '\0';
}
void inflate(FILE *fp, char *fname) {
truncate_suffix(fname);
/* Create output file */
FILE *out = fopen(fname, "wb+");
/* Read the file into a buffer */
fseek(fp, 0, SEEK_END);
long size = ftell(fp);
rewind(fp);
char *buf = (char *) malloc(size);
if (!buf) {
fprintf(stderr, "error: memory error\n");
exit(1);
}
fread(buf, 1, size, fp);
while (_CUR_BIT < 8 * size) {
read_block(buf, out);
}
fclose(out);
free(buf);
}
int main(int argc, char *argv[]) {
if (argc != 2) {
fprintf(stderr, "usage: inflate [file]\n");
return 1;
}
char *fname = argv[1];
FILE *fp = fopen(fname, "rb");
if (!fp) {
fprintf(stderr, "error: file not found\n");
return 1;
}
inflate(fp, fname);
fclose(fp);
return 0;
}
#include <gtest/gtest.h>
#include <inflate.h>
/* Test that we read the bits in the correct order */
TEST (BufferTests, GetNextBitTest) {
reset_pos();
/* The 4-byte buffer in memory */
unsigned char buf[4] = {175, /* = 10101111 */
240, /* = 11110000 */
15, /* = 00001111 */
204}; /* = 11001100 */
/*
* The real stream order.
* If the bytes are laid out in memory as
* [b0b1..b7] [b8...b15] ...
* Then the correct stream order is:
* [b7...b0] [b15...b8] ...
*/
int stream_order[32] = {1, 1, 1, 1, 0, 1, 0, 1,
0, 0, 0, 0, 1, 1, 1, 1,
1, 1, 1, 1, 0, 0, 0, 0,
0, 0, 1, 1, 0, 0, 1, 1};
for (int i = 0; i < 32; i++) {
// TODO this cast should probably not be necessary
// change type signature to unsigned char in inflate
ASSERT_EQ (get_next_bit((char *) buf), stream_order[i]);
}
}
/*
* Test that we get the correct numerical value when
* reading from a buffer.
* In this test, all integers are interpreted in
* STREAM ORDER. That is, if the bitstream
* is (b0, b1, b2, ..., bn), we interpret
* b0 as the MSB and bn as the LSB. Note that the order
* of the bitstream is not necessarily the order that
* the bits are laid out in memory; see get_next_bit().
*/
TEST (BufferTests, StreamOrderTest) {
reset_pos();
unsigned char buf[2] = {220, /* = 11011100 */
145}; /* = 10010001 */
/* First 3 bits are 001 */
ASSERT_EQ (get_n_bits((char *) buf, 3, false), 1);
/* Next 5 are 11011 */
ASSERT_EQ (get_n_bits((char *) buf, 5, false), 27);
/* Next 2 are 10 */
ASSERT_EQ (get_n_bits((char *) buf, 2, false), 2);
/* Next 6 are 001001 */
ASSERT_EQ (get_n_bits((char *) buf, 6, false), 9);
}
/*
* Like the test above, but interprets integers in REVERSE STREAM ORDER.
* That is, if we read in the bitstream (b0, b1, ..., bn),
* b0 is the LSB and bn is the MSB.
*/
TEST (BufferTests, RevStreamOrderTest) {
reset_pos();
unsigned char buf[2] = {220, /* = 11011100 */
145}; /* = 10010001 */
/* First 3 bits are 001 */
ASSERT_EQ (get_n_bits((char *) buf, 3, true), 4);
/* Next 5 are 11011 */
ASSERT_EQ (get_n_bits((char *) buf, 5, true), 27);
/* Next 2 are 10 */
ASSERT_EQ (get_n_bits((char *) buf, 2, true), 1);
/* Next 6 are 001001 */
ASSERT_EQ (get_n_bits((char *) buf, 6, true), 36);
}
// TODO
TEST (HuffmanTests, TestMakeHuffman) {}
TEST (HuffmanTests, TestReadChunk) {}
TEST (HuffmanTests, TestReadLens) {}
int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment