summaryrefslogtreecommitdiff
path: root/src/lzw.c
blob: b1617994ae7a5d5a6f9f8abd0e9f5a0e704c2665 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
/*
 * This file is part of NetSurf's LibNSGIF, http://www.netsurf-browser.org/
 * Licensed under the MIT License,
 *                http://www.opensource.org/licenses/mit-license.php
 *
 * Copyright 2017 Michael Drake <michael.drake@codethink.co.uk>
 */

#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>

#include "lzw.h"

/**
 * \file
 * \brief LZW decompression (implementation)
 *
 * Decoder for GIF LZW data.
 */


/**
 * Context for reading LZW data.
 *
 * LZW data is split over multiple sub-blocks.  Each sub-block has a
 * byte at the start, which says the sub-block size, and then the data.
 * Zero-size sub-blocks have no data, and the biggest sub-block size is
 * 255, which means there are 255 bytes of data following the sub-block
 * size entry.
 *
 * Note that an individual LZW code can be split over up to three sub-blocks.
 */
struct lzw_read_ctx {
	const uint8_t *data;    /**< Pointer to start of input data */
	uint32_t data_len;      /**< Input data length */
	uint32_t data_sb_next;  /**< Offset to sub-block size */

	const uint8_t *sb_data; /**< Pointer to current sub-block in data */
	uint32_t sb_bit;        /**< Current bit offset in sub-block */
	uint32_t sb_bit_count;  /**< Bit count in sub-block */
};

/**
 * LZW dictionary entry.
 *
 * Records in the dictionary are composed of 1 or more entries.
 * Entries point to previous entries which can be followed to compose
 * the complete record.  To compose the record in reverse order, take
 * the `last_value` from each entry, and move to the previous entry.
 * If the previous_entry's index is < the current clear_code, then it
 * is the last entry in the record.
 */
struct lzw_dictionary_entry {
	uint8_t last_value;      /**< Last value for record ending at entry. */
	uint8_t first_value;     /**< First value for entry's record. */
	uint16_t previous_entry; /**< Offset in dictionary to previous entry. */
};

/**
 * LZW decompression context.
 */
struct lzw_ctx {
	/** Input reading context */
	struct lzw_read_ctx input;

	uint32_t previous_code;       /**< Code read from input previously. */
	uint32_t previous_code_first; /**< First value of previous code. */

	uint32_t initial_code_size;     /**< Starting LZW code size. */
	uint32_t current_code_size;     /**< Current LZW code size. */
	uint32_t current_code_size_max; /**< Max code value for current size. */

	uint32_t clear_code; /**< Special Clear code value */
	uint32_t eoi_code;   /**< Special End of Information code value */

	uint32_t current_entry; /**< Next position in table to fill. */

	/** Output value stack. */
	uint8_t stack_base[1 << LZW_CODE_MAX];

	/** LZW decode dictionary. Generated during decode. */
	struct lzw_dictionary_entry table[1 << LZW_CODE_MAX];
};


/* Exported function, documented in lzw.h */
lzw_result lzw_context_create(struct lzw_ctx **ctx)
{
	struct lzw_ctx *c = malloc(sizeof(*c));
	if (c == NULL) {
		return LZW_NO_MEM;
	}

	*ctx = c;
	return LZW_OK;
}


/* Exported function, documented in lzw.h */
void lzw_context_destroy(struct lzw_ctx *ctx)
{
	free(ctx);
}


/**
 * Advance the context to the next sub-block in the input data.
 *
 * \param[in] ctx  LZW reading context, updated on success.
 * \return LZW_OK or LZW_OK_EOD on success, appropriate error otherwise.
 */
static lzw_result lzw__block_advance(struct lzw_read_ctx *ctx)
{
	uint32_t block_size;
	uint32_t next_block_pos = ctx->data_sb_next;
	const uint8_t *data_next = ctx->data + next_block_pos;

	if (next_block_pos >= ctx->data_len) {
		return LZW_NO_DATA;
	}

	block_size = *data_next;

	if ((next_block_pos + block_size) >= ctx->data_len) {
		return LZW_NO_DATA;
	}

	ctx->sb_bit = 0;
	ctx->sb_bit_count = block_size * 8;

	if (block_size == 0) {
		ctx->data_sb_next += 1;
		return LZW_OK_EOD;
	}

	ctx->sb_data = data_next + 1;
	ctx->data_sb_next += block_size + 1;

	return LZW_OK;
}


/**
 * Get the next LZW code of given size from the raw input data.
 *
 * Reads codes from the input data stream coping with GIF data sub-blocks.
 *
 * \param[in]  ctx        LZW reading context, updated.
 * \param[in]  code_size  Size of LZW code to get from data.
 * \param[out] code_out   Returns an LZW code on success.
 * \return LZW_OK or LZW_OK_EOD on success, appropriate error otherwise.
 */
static inline lzw_result lzw__next_code(
		struct lzw_read_ctx *ctx,
		uint8_t code_size,
		uint32_t *code_out)
{
	uint32_t code = 0;
	uint8_t current_bit = ctx->sb_bit & 0x7;
	uint8_t byte_advance = (current_bit + code_size) >> 3;

	if (ctx->sb_bit + code_size < ctx->sb_bit_count) {
		/* Fast path: code fully inside this sub-block */
		const uint8_t *data = ctx->sb_data + (ctx->sb_bit >> 3);
		switch (byte_advance) {
			case 2: code |= data[2] << 16;
			case 1: code |= data[1] <<  8;
			case 0: code |= data[0] <<  0;
		}
		ctx->sb_bit += code_size;
	} else {
		/* Slow path: code spans sub-blocks */
		uint8_t byte = 0;
		uint8_t bits_remaining_0 = (code_size < (8 - current_bit)) ?
				code_size : (8 - current_bit);
		uint8_t bits_remaining_1 = code_size - bits_remaining_0;
		uint8_t bits_used[3] = {
			[0] = bits_remaining_0,
			[1] = bits_remaining_1 < 8 ? bits_remaining_1 : 8,
			[2] = bits_remaining_1 - 8,
		};

		while (true) {
			const uint8_t *data = ctx->sb_data;
			lzw_result res;

			/* Get any data from end of this sub-block */
			while (byte <= byte_advance &&
					ctx->sb_bit < ctx->sb_bit_count) {
				code |= data[ctx->sb_bit >> 3] << (byte << 3);
				ctx->sb_bit += bits_used[byte];
				byte++;
			}

			/* Check if we have all we need */
			if (byte > byte_advance) {
				break;
			}

			/* Move to next sub-block */
			res = lzw__block_advance(ctx);
			if (res != LZW_OK) {
				return res;
			}
		}
	}

	*code_out = (code >> current_bit) & ((1 << code_size) - 1);
	return LZW_OK;
}


/**
 * Clear LZW code dictionary.
 *
 * \param[in]  ctx            LZW reading context, updated.
 * \param[out] stack_pos_out  Returns current stack position.
 * \return LZW_OK or error code.
 */
static lzw_result lzw__clear_codes(
		struct lzw_ctx *ctx,
		const uint8_t ** const stack_pos_out)
{
	uint32_t code;
	uint8_t *stack_pos;

	/* Reset dictionary building context */
	ctx->current_code_size = ctx->initial_code_size + 1;
	ctx->current_code_size_max = (1 << ctx->current_code_size) - 1;;
	ctx->current_entry = (1 << ctx->initial_code_size) + 2;

	/* There might be a sequence of clear codes, so process them all */
	do {
		lzw_result res = lzw__next_code(&ctx->input,
				ctx->current_code_size, &code);
		if (res != LZW_OK) {
			return res;
		}
	} while (code == ctx->clear_code);

	/* The initial code must be from the initial dictionary. */
	if (code > ctx->clear_code) {
		return LZW_BAD_ICODE;
	}

	/* Record this initial code as "previous" code, needed during decode. */
	ctx->previous_code = code;
	ctx->previous_code_first = code;

	/* Reset the stack, and add first non-clear code added as first item. */
	stack_pos = ctx->stack_base;
	*stack_pos++ = code;

	*stack_pos_out = stack_pos;
	return LZW_OK;
}


/* Exported function, documented in lzw.h */
lzw_result lzw_decode_init(
		struct lzw_ctx *ctx,
		const uint8_t *compressed_data,
		uint32_t compressed_data_len,
		uint32_t compressed_data_pos,
		uint8_t code_size,
		const uint8_t ** const stack_base_out,
		const uint8_t ** const stack_pos_out)
{
	struct lzw_dictionary_entry *table = ctx->table;

	/* Initialise the input reading context */
	ctx->input.data = compressed_data;
	ctx->input.data_len = compressed_data_len;
	ctx->input.data_sb_next = compressed_data_pos;

	ctx->input.sb_bit = 0;
	ctx->input.sb_bit_count = 0;

	/* Initialise the dictionary building context */
	ctx->initial_code_size = code_size;

	ctx->clear_code = (1 << code_size) + 0;
	ctx->eoi_code   = (1 << code_size) + 1;

	/* Initialise the standard dictionary entries */
	for (uint32_t i = 0; i < ctx->clear_code; ++i) {
		table[i].first_value = i;
		table[i].last_value  = i;
	}

	*stack_base_out = ctx->stack_base;
	return lzw__clear_codes(ctx, stack_pos_out);
}


/* Exported function, documented in lzw.h */
lzw_result lzw_decode(struct lzw_ctx *ctx,
		const uint8_t ** const stack_pos_out)
{
	lzw_result res;
	uint32_t code_new;
	uint32_t code_out;
	struct lzw_dictionary_entry *entry;
	uint8_t *stack_pos = ctx->stack_base;
	uint32_t clear_code = ctx->clear_code;
	uint32_t current_entry = ctx->current_entry;
	struct lzw_dictionary_entry * const table = ctx->table;

	/* Get a new code from the input */
	res = lzw__next_code(&ctx->input, ctx->current_code_size, &code_new);
	if (res != LZW_OK) {
		return res;
	}

	if (code_new == clear_code) {
		/* Got Clear code */
		return lzw__clear_codes(ctx, stack_pos_out);

	} else if (code_new == ctx->eoi_code) {
		/* Got End of Information code */
		return LZW_EOI_CODE;
	}

	if (current_entry >= (1 << LZW_CODE_MAX)) {
		/* No space in table for new entries, only Clear and
		 * End of Information codes were allowed. */
		return LZW_BAD_DATA;
	}

	entry = table + current_entry;
	if (code_new > current_entry) {
		/* Code is invalid */
		return LZW_BAD_CODE;
	} else if (code_new < current_entry) {
		/* Code is in table */
		code_out = code_new;
		entry->last_value = table[code_new].first_value;
	} else {
		/* Code not in table */
		*stack_pos++ = ctx->previous_code_first;
		code_out = ctx->previous_code;
		entry->last_value = ctx->previous_code_first;
	}
	entry->first_value    = ctx->previous_code_first;
	entry->previous_entry = ctx->previous_code;

	/* Ensure code size is increased, if needed. */
	if (current_entry == ctx->current_code_size_max) {
		if (ctx->current_code_size < LZW_CODE_MAX) {
			ctx->current_code_size++;
			ctx->current_code_size_max =
					(1 << ctx->current_code_size) - 1;
		}
	}
	ctx->current_entry++;

	ctx->previous_code_first = table[code_new].first_value;
	ctx->previous_code = code_new;

	/* Put rest of data for this code on output stack.
	 * Note, in the case of "code not in table", the last entry of the
	 * current code has already been placed on the stack above. */
	while (code_out > clear_code) {
		entry = table + code_out;
		*stack_pos++ = entry->last_value;
		code_out = entry->previous_entry;
	}
	*stack_pos++ = table[code_out].last_value;

	*stack_pos_out = stack_pos;
	return LZW_OK;
}