Source Code 1992 March

home *** CD-ROM | disk | FTP | other *** search

/ Source Code 1992 March / Source_Code_CD-ROM_Walnut_Creek_March_1992.iso / msdos / filutl / beyondc.arc / COMPARE.C < prev next >

Wrap

C/C++ Source or Header | 1987-09-01 | 16.3 KB | 489 lines

/* ** Copyright (c) 1987, Tom Steppe. All rights reserved. ** ** This module compares two arrays of lines (representing ** files) and reports the sequences of consecutive matching ** lines between them using the "recursive longest matching ** sequence" algorithm. This is useful for implementing a ** file comparison utility. ** ** Compiler: Microsoft (R) C Compiler Version 4.00 */ #include <stdio.h> #include <ctype.h> #include <string.h> #include <malloc.h> /* Boolean type and values. */ typedef int BOOLEAN; #define TRUE 1 #define FALSE 0 /* Minimum macro. */ #define min(x, y) (((x) <= (y)) ? (x) : (y)) /* Value to indicate identical strings with strcmp. */ #define ALIKE 0 /* Result of hashing function for a line of text. */ typedef unsigned int HASH; /* Mask for number of bits in hash code. (12 bits). */ #define MASK (unsigned int) 0x0FFF /* Number of possible hash codes. */ #define HASHSIZ (MASK + 1) /* Information about an entry in a hash table. */ typedef struct tblentry { int frst; /* First line # with this hash code. */ int last; /* Last line # with this hash code. */ } TBLENTRY; /* Information about a line of text. */ typedef struct lineinf { HASH hash; /* Hash code value. */ int nxtln; /* Next line with same hash (or 0). */ } LINEINF; /* Information about a file. */ typedef struct fileinf { char **txt; /* Array of lines of text. */ LINEINF *line; /* Array of line info structs. */ TBLENTRY *hashtbl; /* Hash table. */ } FILEINF; /* Function declarations. */ BOOLEAN filcmp (char **, int, char **, int, int); BOOLEAN get_inf (char **, int, FILEINF *); HASH calc_hash (char *); void fnd_seq (FILEINF *, int, int, FILEINF *, int, int, int); BOOLEAN chk_hashes (LINEINF *, LINEINF *, int); int cnt_matches (char **, char **, int); void rpt_seq (int, int, int); /*********************************************************** ** compare compares two arrays of lines and reports the ** sequences of consecutive matching lines. The zeroth ** element of each array is unused so that the index into ** the array is identical to the associated line number. ** ** RETURNS: TRUE if comparison succeeded. ** FALSE if not enough memory. ***********************************************************/ BOOLEAN compare (a1, n1, a2, n2, lngval) char **a1; /* (I) Array of lines of text in #1. */ int n1; /* (I) Number of lines in a1. (Does not count 0th element.) */ char **a2; /* (I) Array of lines of text in #2. */ int n2; /* (I) Number of lines in a2. (Does not count 0th element.) */ int lngval; /* (I) "Long enough" value. */ { FILEINF f1; /* File information for #1. */ FILEINF f2; /* File information for #2. */ BOOLEAN rtn; /* Return value. */ /* Gather information for each file, then compare. */ if (rtn = (get_inf (a1, n1, &f1) && get_inf (a2, n2, &f2))) { fnd_seq (&f1, 1, n1, &f2, 1, n2, lngval); } return (rtn); } /*********************************************************** ** get_inf calculates hash codes and builds a hash table. ** ** RETURNS: TRUE if get_inf succeeded. ** FALSE if not enough memory. ***********************************************************/ static BOOLEAN get_inf (a, n, f) char **a; /* (I) Array of lines of text. */ int n; /* (I) Number of lines in a. */ FILEINF *f; /* (O) File information. */ { unsigned int size; /* Size of hash table. */ register int i; /* Counter. */ TBLENTRY *entry; /* Entry in hash table. */ /* Assign the array of text. */ f->txt = a; /* Allocate and initialize a hash table. */ size = HASHSIZ * sizeof (TBLENTRY); if (f->hashtbl = (TBLENTRY *) malloc (size)) { memset ((char *) f->hashtbl, '\0', size); } else { return (FALSE); } /* If there are any lines: */ if (n > 0) { /* Allocate an array of line structures. */ if (f->line = (LINEINF *) malloc ((n + 1) * sizeof (LINEINF *))) { /* Loop through the lines. */ for (i = 1; i <= n; i++) { /* Calculate the hash code value. */ f->line[i].hash = calc_hash (f->txt[i]); /* Locate the entry in the hash table. */ entry = f->hashtbl + f->line[i].hash; /* Update the linked list of lines with */ /* the same hash code. */ f->line[entry->last].nxtln = i; f->line[i].nxtln = 0; /* Update the first and last line */ /* information in the hash table. */ if (entry->frst == 0) { entry->frst = i; } entry->last = i; } } else { return (FALSE); } } else { f->line = NULL; } return (TRUE); } /*********************************************************** ** calc_hash calculates a hash code for a line of text. ** ** RETURNS: a hash code value. ***********************************************************/ static HASH calc_hash (buf) char *buf; /* (I) Line of text. */ { register unsigned int chksum; /* Checksum. */ char *s; /* Pointer. */ HASH hash; /* Hash code value. */ /* Build up a checksum of the characters in the text. */ for (chksum = 0, s = buf; *s; chksum ^= *s++) { ; } /* Combine the 7-bit checksum and as much of the */ /* length as is possible. */ hash = ((chksum & 0x7F) | ((s - buf) << 7)) & MASK; return (hash); } /*********************************************************** ** Given starting and ending line numbers, fnd_seq finds a ** "good sequence" of lines within those ranges. fnd_seq ** then recursively finds "good sequences" in the sections ** of lines above the "good sequence" and below it. ***********************************************************/ static void fnd_seq (f1, beg1, end1, f2, beg2, end2, lngval) FILEINF *f1; /* (I) File information for #1. */ int beg1; /* (I) First line # to compare in #1. */ int end1; /* (I) Last line # to compare in #1. */ FILEINF *f2; /* (I) File information for #2. */ int beg2; /* (I) First line # to compare in #2. */ int end2; /* (I) Last line # to compare in #2. */ int lngval; /* (I) "Long enough" value. */ { LINEINF *line1; /* Line information ptr in #1. */ LINEINF *line2; /* Line information ptr in #2. */ register int limit; /* Looping limit. */ int ln1; /* Line number in #1. */ int ln2; /* Line number in #2. */ register int ln; /* Working line number. */ BOOLEAN go; /* Continue to loop? */ int most; /* Longest possible seq. */ int most1; /* Longest possible due to #1. */ int most2; /* Longest possible due to #2. */ int cnt; /* Length of longest seq. */ int oldcnt; /* Length of prev longest seq. */ int n; /* Length of cur longest seq. */ int m1; /* Line of longest seq. in #1. */ int m2; /* Line of longest seq. in #2. */ /* Initialize. */ go = TRUE; line1 = f1->line; line2 = f2->line; /* Initialize longest sequence information. */ cnt = 0; /* Length of longest seq. */ m1 = beg1 - 1; /* Line # of longest seq. in #1. */ m2 = beg2 - 1; /* Line # of longest seq. in #2. */ oldcnt = 0; /* Length of prev longest seq. */ /* Calculate maximum possible number of consecutive */ /* lines that can match (based on line # ranges). */ most1 = end1 - beg1 + 1; most2 = end2 - beg2 + 1; /* Scan lines looking for a "good sequence". ** Compare lines in the following order of line numbers: ** ** (1, 1) ** (1, 2), (2, 1), (2, 2) ** (1, 3), (2, 3), (3, 1), (3, 2), (3, 3) ** etc. */ for (ln1 = beg1, ln2 = beg2; TRUE; ln1++, ln2++) { if (ln2 <= end2 - cnt) /* There are enough lines left in #2 such that it */ /* is possible to find a longer sequence. */ { /* Determine the limit in #1 that both */ /* enforces the order scheme and still makes */ /* it possible to find a longer sequence. */ limit = min (ln1 - 1, end1 - cnt); /* Calculate first potential match in #1. */ for (ln = f1->hashtbl[line2[ln2].hash].frst; ln && ln < beg1; ln = line1[ln].nxtln) { ; } /* Loop through the lines in #1. */ for (; ln && ln <= limit; ln = line1[ln].nxtln) { if (line1[ln].hash == line2[ln2].hash && line1[ln + cnt].hash == line2[ln2 + cnt].hash && !(ln - m1 == ln2 - m2 && ln < m1 + cnt && m1 != beg1 - 1)) /* A candidate for a longer sequence has */ /* been located. The current lines */ /* match, the current lines + cnt match, */ /* and this sequence is not a subset of */ /* the longest sequence so far. */ { /* Calculate most possible matches. */ most = min (end1 - ln + 1, most2); /* First compare hash codes. If the */ /* number of matches exceeds the */ /* longest sequence so far, then */ /* compare the actual text. */ if (chk_hashes (line1 + ln, line2 + ln2, cnt) && (n = cnt_matches (f1->txt + ln, f2->txt + ln2, most)) > cnt) /* This is the longest seq. so far. */ { /* Update longest sequence info. */ oldcnt = cnt; cnt = n; m1 = ln; m2 = ln2; /* If it's long enough, end the */ /* search. */ if (cnt >= lngval) { break; } /* Update limit, using new count. */ limit = min (ln1 - 1, end1 - cnt); } } } /* If it's long enough, end the search. */ if (cnt >= lngval) { break; } most2--; } else { go = FALSE; /* This file is exhausted. */ } /* Repeat the process for the other file. */ if (ln1 <= end1 - cnt) { limit = min (ln2, end2 - cnt); for (ln = f2->hashtbl[line1[ln1].hash].frst; ln && ln < beg2; ln = line2[ln].nxtln) { ; } for (; ln && ln <= limit; ln = line2[ln].nxtln) { if (line1[ln1].hash == line2[ln].hash && line1[ln1 + cnt].hash == line2[ln+ cnt].hash && !(ln1 - m1 == ln - m2 && ln1 < m1 + cnt && m2 != beg2 - 1)) { most = min (end2 - ln + 1, most1); if (chk_hashes (line1 + ln1, line2 + ln, cnt) && (n = cnt_matches (f1->txt + ln1, f2->txt + ln, most)) > cnt) { oldcnt = cnt; cnt = n; m1 = ln1; m2 = ln; if (cnt >= lngval) { break; } limit = min (ln2, end2 - cnt); } } } if (cnt >= lngval) { break; } most1--; } else if (!go) { break; /* This file is exhausted, also. */ } } /* If the longest sequence is shorter than the "long */ /* enough" value, the "long enough" value can be */ /* adjusted for the rest of the comparison process. */ if (cnt < lngval) { lngval = cnt; } if (cnt >= 1) /* Longest sequence exceeds minimum necessary size. */ { if (m1 != beg1 && m2 != beg2 && oldcnt > 0) /* There is still something worth comparing */ /* previous to the sequence. */ { /* Use knowledge of the previous longest seq. */ fnd_seq (f1, beg1, m1 - 1, f2, beg2, m2 - 1, oldcnt); } /* Report the sequence. */ rpt_seq (m1, m2, cnt); if (m1 + cnt - 1 != end1 && m2 + cnt - 1 != end2) /* There is still something worth comparing */ /* subsequent to the sequence. */ { fnd_seq (f1, m1 + cnt, end1, f2, m2 + cnt, end2, lngval); } } } /*********************************************************** ** chk_hashes determines whether this sequence of matching ** hash codes is longer than cnt. It knows that the first ** pair of hash codes is guaranteed to match. ** ** RETURNS: TRUE if this sequence is longer than cnt. ** FALSE if this sequence is not longer than cnt. ***********************************************************/ static BOOLEAN chk_hashes (line1, line2, cnt) LINEINF *line1; /* (I) Line information for #1. */ LINEINF *line2; /* (I) Line information for #2. */ register int cnt; /* (I) Count to try to exceed. */ { register int n; /* Count of consecutive matches. */ for (n = 1; n <= cnt && ((++line1)->hash == (++line2)->hash); n++) { ; } return (n > cnt); } /*********************************************************** ** cnt_matches counts the number of consecutive matching ** lines of text. ** ** RETURNS: number of consecutive matching lines. ***********************************************************/ static int cnt_matches (s1, s2, most) char **s1; /* (I) Starting line in file #1. */ char **s2; /* (I) Starting line in file #2. */ register int most; /* (I) Most matching lines possible. */ { register int n; /* Count of consecutive matches. */ /* Count the consecutive matches. */ for (n = 0; n < most && strcmp (*s1++, *s2++) == ALIKE; n++) { ; } return (n); } /*********************************************************** ** rpt_seq reports a matching sequence of lines. ***********************************************************/ static void rpt_seq (m1, m2, cnt) int m1; /* (I) Location of matching sequence in #1. */ int m2; /* (I) Location of matching sequence in #2. */ int cnt; /* (I) Number of lines in matching sequence. */ { fprintf (stdout, "Matched %5d lines: (%5d - %5d) and (%5d - %5d)\n", cnt, m1, m1 + cnt - 1, m2, m2 + cnt - 1); }