Power-Programmierung

home *** CD-ROM | disk | FTP | other *** search

/ Power-Programmierung / CD2.mdf / c / library / dos / packer / zip4c / im_lmat.c < prev next >

Wrap

C/C++ Source or Header | 1991-10-03 | 27.0 KB | 726 lines

/* Copyright (C) 1990,1991 Mark Adler, Richard B. Wales, and Jean-loup Gailly. Permission is granted to any individual or institution to use, copy, or redistribute this software so long as all of the original files are included unmodified, that it is not sold for profit, and that this copyright notice is retained. */ /* * im_lmat.c by Jean-loup Gailly. * * PURPOSE * * Identify new text as repetitions of old text within a fixed- * length sliding window trailing behind the new text. * * DISCUSSION * * The "implosion" process depends on being able to identify portions * of the input text which are identical to earlier input (within a * sliding window trailing behind the input currently being processed). * * The most straightforward technique turns out to be the fastest for * most input files: try all possible matches and select the longest. * The key feature is of this algorithm is that insertion and deletions * from the string dictionary are very simple and thus fast. Insertions * and deletions are performed at each input character, whereas string * matches are performed only when the previous match ends. So it is * preferable to spend more time in matches to allow very fast string * insertions and deletions. The matching algorithm for small strings * is inspired from that of Rabin & Karp. A brute force approach is * used to find longer strings when a small match has been found. * A similar algorithm is used in freeze (by Leonid Broukhis) but the * algorithm used here is faster. * A previous version of this file used a more sophisticated algorithm * (by Fiala and Greene) which is guaranteed to run in linear amortized * time, but has a larger average cost and uses more memory. However * the F&G algorithm may be faster for some highly redundant files if * the parameter max_chain_length (described below) is too large. * * ACKNOWLEDGEMENTS * * Rich Wales defined the interface, provided the necessary information * to ensure compatibility with pkunzip 1.0 (not an easy job) and * suggested the solution (n == 1 + n-1) adopted here. * The idea of lazy evaluation of matches is due to Jan Mark Wams, and * I found it in 'freeze' written by Leonid Broukhis. * Special thanks to Kai-Uwe Rommel for the OS/2 port, to Glenn J. * Andrews for the VMS port, and to many other info-zippers for testing. * * REFERENCES * * A description of the Rabin and Karp algorithm is given in the book * "Algorithms" by R. Sedgewick, Addison-Wesley, p252. * * Fiala,E.R., and Greene,D.H. * Data Compression with Finite Windows, CACM, 32,4 (1989) 490-595. * * INTERFACE * * ImpErr lm_init (int pack_level) * Initialize the "longest match" routines for a new file. * The global variable fd is an implicit parameter. * * ImpErr lm_input (U_CHAR *block, U_INT count) * Process a block of input characters. * * ImpErr lm_windup (void) * Flush out the remaining unprocessed input. */ #include "implode.h" /*********************************************************************** * * Configuration parameters */ #define MAX_MATCH_LENGTH 320 /* The maximum match length. 320 = 64 + 256. (If the length is greater than * 63, pkzip uses an extra byte.) */ #define MAX_WBITS 13 #define WSIZE (1 << MAX_WBITS) /* Maximum window size = 8K */ /* Constants used to dimension the hash table: */ #define HASH_BITS 14 /* HASH_BITS must be >= 13, see longest_match() */ #define HASH_SIZE (1<<HASH_BITS) #define HASH_MASK (HASH_SIZE-1) #if defined(MSDOS) || defined(i386) || defined(mc68020) || defined(vax) # define UNALIGNED_OK /* Define this symbol if your target allows access to unaligned data. * This is not mandatory, just a speed optimization. The compressed * output is strictly identical. */ #endif #ifdef __TURBOC__ # define DYN_ALLOC /* Turbo C 2.0 does not accept far static allocations in small model */ #endif /*********************************************************************** * * Local data used by the "longest match" routines. */ #if HASH_BITS <= 14 typedef unsigned short Hash; #else /* Defined just for safety, since values > 14 do not speed up implosion */ typedef unsigned long Hash; #endif typedef unsigned short Pos; typedef unsigned int IPos; /* A Pos is an index in the character window. We use short instead of int to * save space in the various tables. IPos is used only for parameter passing. */ int near min_match_length; /* Minimum match length, 2 for binary files, 3 for ascii files. * (bad luck for ebcdic users; not because they may not get optimal * compression, but because they have to use ebcdic machines :-) * A zero value means that the min_match_length is not yet determined. */ U_CHAR near window[MAX_MATCH_LENGTH + WSIZE + BSZ]; /* MAX_MATCH_LENGTH bytes are duplicated at both ends of the window, * to speed up string comparisons. The BSZ extra bytes allow a block copy * of the input buffer into the window instead of a copy one byte at a time. */ #define MAX_DIST (WSIZE + BSZ) /* Maximum theoretical distance between two distinct bytes in the window. * Actual distances are limited to bufsize. */ #define NIL MAX_DIST /* Tail of hash chains */ #ifdef DYN_ALLOC Hash far *next = NULL; Pos far *prev = NULL; #else Hash far next[MAX_DIST+1]; Pos far prev[MAX_DIST+HASH_SIZE+1]; #endif /* next is a link to a more recent string with same hash index, or to the head * of a hash table chain if there is no such string. next[NIL] is used to * avoid extra checks. next[s] is NIL if string s is not yet in the dictionary * * prev is a link to an older string with same hash index (first MAX_DIST * values) or head of hash chain (last HASH_SIZE values). prev[NIL] is used * to avoid extra checks. */ #define match_head (prev+(MAX_DIST+1)) Hash near ins_h; /* hash index of string to be inserted. */ int near h_shift; /* Number of bits by which ins_h must be shifted at each * input step. It must be such that after min_match_length steps, the oldest * byte no longer takes part in the hash key, that is: * h_shift * min_match_length >= HASH_BITS */ MATCH *ma_buf = NULL; /* Buffer used to speed up reading/writing to/from temp file */ #define MA_BUFEND (ma_buf+MA_BUFSIZE) MATCH *ma; /* Pointer to the most recent match. */ int near start_length; /* Matches not greater than this are discarded. This is used in the lazy match * evaluation. If start_length > 1, ma is a valid guess of length start_length * and ct_tally has not yet been called. */ int near strstart; /* start of string to insert */ int near strsize; /* length of string to insert */ int near match_length; /* length of current best match */ int near bufsize; /* # of slots in window */ int near checkpoint; /* look for new match at this point */ static int insert_point; /* position of next input buffer */ static int max_lazy_match; /* We try lazy evaluation only for matches of length 2..max_lazy_match, to * speed up the implosion. We use 0 for maximum speed, 0.9*MAX_MATCH_LENGTH * for maximum compression. */ int near max_chain_length; /* To speed up implosion, hash chains are truncated to this length. * A higher limit improves compression ratio but degrades the speed. * We use 40 for maximum speed, 960 for maximum compression. Values * below 20 are not recommended. */ /* Values for max_lazy_match and max_chain_length, depending on the desired * pack level (0..9). The values given below have been tuned to exclude * worst case performance for pathological files. Better values may be * found for specific files. Note that the current algorithm requires * max_lazy >= 2. */ typedef struct config { int max_lazy; int max_chain; } config; static config configuration_table[10] = { /* 0 */ {2, MAX_MATCH_LENGTH/8}, /* maximum speed */ /* 1 */ {4, MAX_MATCH_LENGTH/4}, /* 2 */ {5, MAX_MATCH_LENGTH/2}, /* 3 */ {MAX_MATCH_LENGTH/16, MAX_MATCH_LENGTH/2}, /* 4 */ {MAX_MATCH_LENGTH/16, 3*MAX_MATCH_LENGTH/4}, /* 5 */ {MAX_MATCH_LENGTH/16, MAX_MATCH_LENGTH}, /* 6 */ {MAX_MATCH_LENGTH/16, 3*MAX_MATCH_LENGTH/2}, /* 7 */ {MAX_MATCH_LENGTH/16, 2*MAX_MATCH_LENGTH}, /* 8 */ {9*MAX_MATCH_LENGTH/10, 2*MAX_MATCH_LENGTH}, /* 9 */ {9*MAX_MATCH_LENGTH/10, 3*MAX_MATCH_LENGTH}}; /* maximum compression */ #define MIN(a,b) ((a) <= (b) ? (a) : (b)) /* The arguments must not have side effects. */ #define EQUAL 0 /* result of strncmp for equal strings */ /* Prototypes for local functions */ static void set_min_match_length OF ((U_CHAR *block, U_INT count)); ImpErr write_match OF ((IPos ma_start, int ma_length)); IPos longest_match OF ((IPos cur_match)); ImpErr lm_process OF ((U_INT count)); /*********************************************************************** * * Initialize the "longest match" routines for a new file. * The global variable fd is an implicit parameter. */ ImpErr lm_init (pack_level) int pack_level; /* 0: best speed, 9: best compression, other: default */ { register int i; /* Validate the arguments */ bufsize = fd.fd_bufsize; strsize = MIN (fd.fd_strsize, MAX_MATCH_LENGTH); if (bufsize > WSIZE) return IM_BADARG; if (bufsize < 2 * strsize) return IM_BADARG; if (pack_level < 0 || pack_level > 9) return IM_BADARG; /* Make sure "bufsize" is a power of 2 */ if ((bufsize & (bufsize - 1)) != 0) return IM_BADARG; /* Use dynamic allocation if compiler does not like big static arrays: */ #ifdef DYN_ALLOC if (prev == NULL) { next = (Hash far*)farmalloc((U_INT)(MAX_DIST+9)*sizeof(Hash)); prev = (Pos far*) farmalloc((U_INT)(MAX_DIST+HASH_SIZE+9)*sizeof(Pos)); /* We allocate 16 extra bytes for the normalization under MSDOS */ if (prev == NULL || next == NULL) return IM_NOMEM; # if defined(MSDOS) && !defined(OS2) /* Normalize to pointers with offset 0 (required by asm version). * For OS/2, we can't of course play such nasty games. */ #define NORMALIZE(ptr) { \ *((int*)&ptr+1) += ((unsigned)(ptr-0) + 15) >> 4; \ *(int*)&ptr = 0; \ } NORMALIZE(prev); NORMALIZE(next); # endif } #endif /* DYN_ALLOC */ /* Initialize the hash tables. */ for (i = 0; i < HASH_SIZE; i++) match_head[i] = NIL; for (i = 0; i <= MAX_DIST; i++) next[i] = NIL; /* prev[0..MAX_DIST] will be initialized on the fly */ ins_h = 0; /* Assume strsize zeros before the input (bytes beyond strsize * can be garbage): */ memset((char*)window, 0, MAX_MATCH_LENGTH); /* It is not necessary to duplicate this at the end of the window. * Duplication will start only after the first wrap around. */ insert_point = MAX_MATCH_LENGTH; /* Force a check for the file type (ascii/binary) and set the default * configuration parameters: */ min_match_length = 0; max_lazy_match = configuration_table[pack_level].max_lazy; max_chain_length = configuration_table[pack_level].max_chain; /* Do not report matches before the first strsize strings have been * inserted in the suffix tree: */ strstart = 0; checkpoint = strsize; if (ma_buf == NULL) { ma_buf = (MATCH *) malloc ((unsigned) (MA_BUFSIZE * sizeof (MATCH))); if (ma_buf == NULL) return IM_NOMEM; } ma = ma_buf - 1; start_length = 1; /* All done. */ return IM_OK; } /*********************************************************************** * * Output the match info. * IN assertions: The matching strings start at strstart and ma_start * and have a length of ma_length bytes. * If ma_length is not greater than start_length, ma_start is garbage. * strstat == checkpoint. If start_length > 1, ma is the * previous match which has not yet been output. * OUT assertion: checkpoint is reset according to the match length * actually chosen. * ma is set to the current match, with start_length set appropriately. */ ImpErr write_match(ma_start, ma_length) IPos ma_start; /* start of matched string */ int ma_length; /* length of complete match */ { int ma_dist = 0; /* distance of current match */ /* ma_length can be too large towards the end of the input: */ if (ma_length > strsize) ma_length = strsize; #ifdef DEBUG /* check that the match is indeed a match */ if (ma_length > start_length && strncmp(window + ma_start, window + strstart, ma_length) != EQUAL) { fprintf(stderr, "write_match: ma_start %d, strstart %d, ma_length %d\n", ma_start, strstart, ma_length); exit(1); } #endif /* PKUNZIP accepts most overlapping matches. However, when the * distance has the value 1, versions of PKUNZIP prior to 1.10 don't * handle the overlap properly -- and version 1.10 handles the * overlap correctly only if the length is limited to 62 plus the * minimum match length; i.e., only if there is no supplementary * length byte. (From phone conversation with Phil Katz, 23 January * 1991.) The compression ratio is generally better when we do not * limit the match length to 64, so we remove distance-one matches * completely. (But PKUNZIP 1.01 also rejects some distance-two matches. * This could be fixed but would degrade compression.) */ if (ma_length > 1) { ma_dist = strstart - ma_start; if (ma_dist < 0) ma_dist += MAX_DIST; if (ma_dist == 1) { /* keep the previous match if it was delayed */ if (start_length > 1) { ma_length = 1; } else { /* Truncate the match to 1 */ ImpErr retcode = write_match(ma_start, 1); if (retcode != IM_OK) return retcode; /* Emit a match with a distance of two and a length reduced by * one. This reduced match may be delayed. */ checkpoint = ++strstart; retcode = write_match(ma_start, ma_length-1); strstart--; return retcode; /* Leave checkpoint unchanged */ } /* start_length > 1 */ } /* ma_dist == 1 */ } /* ma_length > 1 */ /* If the previous match has been delayed, keep it or prefer the * current match: */ if (start_length > 1) { /* Keep the previous match if it is not shorter than the current one. * Otherwise, emit only the first byte of the previous match, * followed by the current match. If we have a delayed match for * the last bytes of the input file, the next match will necessarily * be smaller, so ct_tally will correctly be called for the delayed * match. */ if (start_length >= ma_length) { /* Keep the previous match */ if (start_length == 2) { ma->ma_dist = - ma->ma_dist; ma->l.ma_litc[1] = window[strstart]; /* litc[0] already set */ } else { ma->l.ma_length = start_length; /* overwrite ma->l.ma_litc */ } checkpoint = strstart + start_length - 1; start_length = 1; return ct_tally (ma); } /* Shorten the previous match to zero */ ma->ma_dist = 0; /* keep ma->l.ma_litc */ start_length = 1; (void) ct_tally (ma); /* ignore result, ct_tally cannot fail */ } if (++ma == MA_BUFEND) { ma = ma_buf; if (twrite ((char *) ma, sizeof(MATCH), MA_BUFSIZE, fd.fd_temp) != MA_BUFSIZE) return IM_IOERR; } /* Keep the current match as guess only if its length is small, * trying to find a better match at the next step. If speed is not * critical, we use this lazy mechanism for all lengths. */ if (ma_length > 1) { ma->ma_dist = ma_dist; if (ma_length <= max_lazy_match) { /* Set ma_litc[0]: this is the only way to identify the unmatched * data if the delayed match will be truncated to 1. It is also * useful if ma_length == 2: it may be more efficient in this case * to encode the individual characters rather than the match info. */ ma->l.ma_litc[0] = window[strstart]; start_length = ma_length; checkpoint = strstart + 1; return IM_OK; } /* At this point, ma_length >= 3, no need for ma_litc */ ma->l.ma_length = ma_length; checkpoint = strstart + ma_length; } else { ma->ma_dist = 0; ma->l.ma_litc[0] = window[strstart]; /* ma_litc[1] is not required */ checkpoint = strstart + 1; } return ct_tally (ma); /* Keep start_length == 1 */ } /*********************************************************************** * * Determine the minimum match length, based on the type of data * in the given input buffer: 2 for binary data, 3 otherwise. Set also * h_shift according to the chosen min_match_length, and reduce * max_chain_length for binary files. * If the guess about data type is wrong, this only affects the * compression ratio and speed but not the correctness of the algorithms. * If there are more than 20% bytes which seem non ascii in the first * 500 bytes, we assume that the data is binary. (We accept data * with a few high bits set as ascii to take into account special * word processor formats.) */ static void set_min_match_length (block, count) U_CHAR *block; /* input data */ U_INT count; /* # of input char's */ { int non_ascii = 0; min_match_length = 3; /* Default ascii */ if (count >= 500) { count = 500; while (--count != 0) { if (*block <= 6 || *block >= 0x80) non_ascii++; block++; } if (non_ascii > 100) { min_match_length = 2; max_chain_length >>= 2; } } h_shift = (HASH_BITS+min_match_length-1)/min_match_length; #ifdef DEBUG fprintf(stderr," (min_match_length %d) ", min_match_length); #endif } /*********************************************************************** * * Insert string s in the dictionary and set last_match to the previous head * of the hash chain (the most recent string with same hash key). * IN assertion: all calls to to INSERT_STRING are made with consecutive * input characters, so that a running hash key can be computed from the * previous key instead of complete recalculation each time. */ #define INSERT_STRING(s, last_match) \ { \ ins_h = ((ins_h<<h_shift) ^ window[s + min_match_length-1]) & HASH_MASK; \ prev[s] = last_match = match_head[ins_h]; \ next[last_match] = prev[next[s] = ins_h + MAX_DIST+1] = s; \ } /* next[NIL] is garbage, we can overwrite it if s is a tail */ /*********************************************************************** * * Remove string s from the dictionary, or do nothing if s is not yet * in the dictionary. * IN assertion: s is the tail of its hash chain (the oldest string). */ #define DELETE_STRING(s) {prev[next[s]] = NIL;} /* No effect if next[s] == NIL (s not in dictionary) */ /*********************************************************************** * * Find the longest match starting at the given string. Return its position * and set its length in match_length. Matches shorter or equal to * start_length are discarded, in which case match_length is unchanged * and the result position is NIL. * IN assertions: cur_match is the head of the hash chain for the current * string (strstart) and is not NIL, and start_length >= 1 */ #if !defined(MSDOS) || defined(NO_ASM) /* For MSDOS, a version of this routine written in assembler is in im_lm.asm. * The algorithms are strictly equivalent, so the C version can be used * instead if you do not have masm or tasm. (Update the makefile in this case.) */ IPos longest_match(cur_match) IPos cur_match; { register U_CHAR *match; /* pointer in matched string */ register U_CHAR *scan = window + strstart;/* pointer in current string */ register int len; /* length of current match */ IPos cur_best = NIL; /* best match so far */ register int ma_length = start_length; /* best match length so far */ int chain_count = max_chain_length; /* used to limit hash chains */ typedef unsigned short US; typedef unsigned long UL; #ifdef UNALIGNED_OK register US scan_start = *(US*)scan; register US scan_end = *(US*)(scan+ma_length-1); #else register U_CHAR scan_start = *scan; register U_CHAR scan_end1 = scan[ma_length-1]; register U_CHAR scan_end = scan[ma_length]; #endif do { match = window + cur_match; /* Skip to next match if the match length cannot increase * or if the match length is less than 2: */ #ifdef UNALIGNED_OK /* This code assumes sizeof(unsigned short) == 2 and * sizeof(unsigned long) == 4. Do not use UNALIGNED_OK if your * compiler uses different sizes. */ if (*(US*)(match+ma_length-1) != scan_end || *(US*)match != scan_start) continue; len = min_match_length - 4; /* If min_match_length == 3, it is not necessary to compare * scan[2] and match[2] since they are always equal when the other * bytes match, given that the hash keys are equal and that * HASH_BITS >= 8. */ # define ML MAX_MATCH_LENGTH do {} while ((len+=4) < ML && *(UL*)(scan+len) == *(UL*)(match+len)); if (*(US*)(scan+len) == *(US*)(match+len)) len += 2; if (scan[len] == match[len]) len++; #else /* UNALIGNED_OK */ if (match[ma_length] != scan_end || match[ma_length-1] != scan_end1 || *match != scan_start) continue; /* It is not necessary to compare scan[1] and match[1] since they * are always equal when the other bytes match, given that * the hash keys are equal and that h_shift+8 <= HASH_BITS, * that is, when the last byte is entirely included in the hash key. * The condition is equivalent to * (HASH_BITS+2)/3 + 8 <= HASH_BITS * or: HASH_BITS >= 13 (see set_min_match_length()). * Also, we check for a match at ma_length-1 to get rid quickly of * the match with the suffix of the match made at the previous step, * which is known to fail. */ len = 1; do {} while (++len < MAX_MATCH_LENGTH && scan[len] == match[len]); #endif /* UNALIGNED_OK */ if (len > ma_length) { cur_best = cur_match, ma_length = len; if (len >= strsize) break; #ifdef UNALIGNED_OK scan_end = *(US*)(scan+ma_length-1); #else scan_end1 = scan[ma_length-1]; scan_end = scan[ma_length]; #endif } } while (--chain_count != 0 && (cur_match = prev[cur_match]) != NIL); if (ma_length > start_length) match_length = ma_length; return cur_best; } #endif /* MSDOS */ /*********************************************************************** * * Process a block of input characters, generating zero or more match * info records as appropriate. * IN assertion: count <= BSZ */ ImpErr lm_input (block, count) U_CHAR *block; /* input data */ U_INT count; /* # of input char's */ { if (count == 0) return IM_OK; /* Determine the input file type if this is the first call */ if (min_match_length == 0) set_min_match_length (block, count); if (insert_point + count <= sizeof(window)) { memcpy((char*)window + insert_point, (char*)block, count); } else { int remain = sizeof(window)-insert_point; memcpy((char*)window + insert_point, (char*)block, remain); memcpy((char*)window + MAX_MATCH_LENGTH, (char*)block + remain, count - remain); } insert_point += count; if (insert_point > MAX_DIST) { /* Duplicate the end of the window */ memcpy((char*)window, (char*)window + MAX_DIST, MIN (insert_point - MAX_DIST, MAX_MATCH_LENGTH)); } if (insert_point >= sizeof(window)) insert_point -= MAX_DIST; return lm_process(count); } /*********************************************************************** * * Process a block of characters already inserted in the window * IN assertion: count > 0 */ #if !defined(MSDOS) || defined(NO_ASM) ImpErr lm_process (count) U_INT count; /* number of bytes to process */ { ImpErr retcode; /* as usual */ IPos cur_match; /* starting point for longest match search */ IPos best_match = NIL; /* longest match found */ int delete_point; /* position of next string to remove */ delete_point = strstart - bufsize + MAX_MATCH_LENGTH - 1; if (delete_point < 0) delete_point += MAX_DIST; /* Process the input block. */ do { /* Insert the string window[strstart .. strstart+strsize-1] in the * dictionary, and set cur_match to the head of the hash chain: */ INSERT_STRING(strstart, cur_match); if (strstart == checkpoint) { /* Find the longest match, discarding those <= start_length */ match_length = 0; if (cur_match != NIL) { best_match = longest_match (cur_match); /* longest_match updates match_length if longer match found */ } retcode = write_match (best_match, match_length); if (retcode != IM_OK) return retcode; } /* Remove the oldest string from the dictionary, except if we have not * yet created bufsize dictionary entries. We could avoid this * deletion and check instead for obsolete pointers in * longest_match(), but this would be slower. */ #if (MAX_DIST & (MAX_DIST-1)) != 0 if (++delete_point == MAX_DIST) delete_point = 0; #else delete_point = (delete_point + 1) & (MAX_DIST-1); #endif DELETE_STRING (delete_point); if (++strstart == MAX_DIST) { strstart = 0, checkpoint -= MAX_DIST; } } while (--count != 0); return IM_OK; } #endif /* MSDOS */ /*********************************************************************** * * Wind up processing by flushing unprocessed input. For normal processing, * this routine is called twice (by imp_size then imp_clear) and the * second call does nothing. In case of error, this routine is called only * by imp_clear(). */ ImpErr lm_windup() { ImpErr retcode; int matches; /* Process the remaining input. */ while (strsize > 0) { retcode = lm_process (1); if (retcode != IM_OK) return retcode; --strsize; } /* Flush the match buffer. */ if ((matches = ma-ma_buf+1) != 0 && matches != twrite ((char *) ma_buf, sizeof(MATCH), matches, fd.fd_temp)) { return IM_IOERR; } ma = ma_buf - 1; return IM_OK; }