home *** CD-ROM | disk | FTP | other *** search
- From: Dick Grune <talcott!seismo!mcvax!vu44!dick>
- Subject: Software similarity tester for C programs
- Newsgroups: mod.sources
- Approved: jpn@panda.UUCP
-
- Mod.sources: Volume 3, Issue 119
- Submitted by: Dick Grune <talcott!seismo!mcvax!vu44!dick>
-
-
- The enclosed shar-archive contains a program that will detect
- stretches in C-programs that look similar (or are just plain equal).
- This is useful for finding "borrowed" soft-ware or for isolating
- possible subroutines in large software systems. It is very fast and
- gives results. We have been using it for about half a year now.
-
- Dick Grune
- Vrije Universiteit
- de Boelelaan 1081
- 1081 HV Amsterdam
- the Netherlands
- ..!mcvax!vu44!dick
- dick@vu44.UUCP
-
-
-
- : This is a shar archive. Extract with sh, not csh.
- : This archive ends with exit, so do not worry about trailing junk.
- : --------------------------- cut here --------------------------
- PATH=/bin:/usr/bin
- echo Extracting \R\E\A\D\_\M\E
- sed 's/^X//' > \R\E\A\D\_\M\E << '+ END-OF-FILE '\R\E\A\D\_\M\E
- XSat Jan 11 15:47:16 1986
- X
- XThis program tests for similar (or equal) stretches in one or more C-programs.
- XSee sim.1
- X
- XTo compile, call "make", which will generate one executable called sim, and
- Xwill run two small tests to show sample output.
- X
- XTo install, examine the Makefile and reset BINDIR and MANDIR to sensible
- Xpaths, and call "make install"
- X
- XTo change the default run size or the page width, adjust the file params.h
- Xand recompile.
- X
- XTo add another language X, write a file X.l along the lines of clang.l,
- Xextend the Makefile and recompile. All knowledge about C in located in
- Xclang.l; the rest of the programs expect each C lexical unit to be a single
- Xcharacter.
- X
- X Dick Grune
- X Vrije Universiteit
- X de Boelelaan 1081
- X 1081 HV Amsterdam
- X the Netherlands
- X ..!mcvax!vu44!dick
- X dick@vu44.UUCP
- + END-OF-FILE READ_ME
- chmod 'u=rw,g=r,o=r' \R\E\A\D\_\M\E
- set `sum \R\E\A\D\_\M\E`
- sum=$1
- case $sum in
- 26812) :;;
- *) echo 'Bad sum in '\R\E\A\D\_\M\E >&2
- esac
- echo Extracting \M\a\k\e\f\i\l\e
- sed 's/^X//' > \M\a\k\e\f\i\l\e << '+ END-OF-FILE '\M\a\k\e\f\i\l\e
- X# This file is part of the software similarity tester SIM.
- X# Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X#
- X
- XBINDIR = /user1/dick/bin# # where to put the binary (sim)
- XMANDIR = /user1/dick/man# # where to put the manual page (sim.1)
- X
- X#
- X# Each module (set of programs that together perform some function)
- X# has the following sets defined for it:
- X# FLS all files of that module, for, e.g.,
- X# printing, sharring, inventory, etc.
- X# SRC the source files, from which other files derive
- X# CFS the C-files, from which the object files derive
- X# OBJ objects
- X# GRB garbage produced by compiling the module
- X#
- X# (This is a feeble attempt at software-engineering a Makefile.)
- X#
- X
- XTEST = pass3.c# # guinea pig
- X
- Xall: sim.res lang.res
- X
- X# The C Language module
- XCLN_OBJ = clang.o
- XCLN_CFS = clang.c
- XCLN_SRC = clang.l
- XCLN_FLS = $(CLN_SRC)
- X
- Xclang.c: clang.l
- X lex -t clang.l >clang.c
- XCLN_GRB = clang.c
- X
- X# Common modules:
- XCOM_OBJ = stream.o idf.o buff.o error.o
- XCOM_CFS = stream.c idf.c buff.c error.c
- XCOM_SRC = $(COM_CFS)
- XCOM_FLS = stream.h idf.h buff.h $(COM_SRC)
- X
- X# The top-package:
- XTOP_OBJ = top.o
- XTOP_CFS = top.c
- XTOP_SRC = $(TOP_CFS)
- XTOP_FLS = top.p top.g top.h $(TOP_SRC)
- X
- X# The similarity tester:
- XSIM_OBJ = sim.o pass1.o hash.o compare.o add_run.o pass2.o pass3.o
- XSIM_CFS = sim.c pass1.c hash.c compare.c add_run.c pass2.c pass3.c
- XSIM_SRC = $(SIM_CFS)
- XSIM_FLS = params.h text.h debug.h $(SIM_SRC)
- X
- XSIM = $(CLN_OBJ) $(COM_OBJ) $(TOP_OBJ) $(SIM_OBJ)
- XCFS = $(CLN_CFS) $(COM_CFS) $(TOP_CFS) $(SIM_CFS)
- X
- Xsim: $(SIM)
- X $(CC) $(SIM) -o sim
- X
- Xsim.res: sim $(TEST)
- X sim -hr 20 $(TEST)
- X
- Xlint: $(CFS)
- X lint -xa $(CFS)
- XSIM_GRB = sim
- X
- X# The language streamliner as a main program:
- XSTR_OBJ = main.o
- XSTR_CFS = main.c
- XSTR_SRC = $(STR_CFS)
- XSTR_FLS = $(STR_SRC)
- X
- XLANG_CFS = $(CLN_CFS) $(STR_CFS) $(COM_CFS)
- XLANG_OBJ = $(CLN_OBJ) $(STR_OBJ) $(COM_OBJ)
- Xlang: $(LANG_OBJ)
- X $(CC) $(LANG_OBJ) -o lang
- X
- Xlang.res: lang $(TEST)
- X lang -1 $(TEST) >lang1.res
- X lang -2 $(TEST) >lang2.res
- X wc lang[12].res $(TEST)
- XLANG_GRB = lang lang1.res lang2.res
- X
- Xlang.lint:
- X lint -xa $(LANG_CFS)
- X
- X# various other entries
- XFLS = READ_ME Makefile sim.1 \
- X $(COM_FLS) $(TOP_FLS) $(SIM_FLS) $(STR_FLS) $(CLN_FLS)
- XSRC = $(COM_SRC) $(TOP_SRC) $(SIM_SRC) $(STR_SRC) $(CLN_SRC)
- XOBJ = $(COM_OBJ) $(TOP_OBJ) $(SIM_OBJ) $(STR_OBJ) $(CLN_OBJ)
- X
- Xprint: $(FLS)
- X pr $(FLS) >print
- XPRINT_GRB = print
- X
- Xshar: $(FLS)
- X shar $(FLS) >shar
- X
- Xfiles: Makefile
- X ls $(FLS) >files
- X
- Xcchk:
- X cchk $(CFS)
- X
- Xsimsim: sim
- X sim -hfr 20 $(SRC)
- X
- Xsimsimx: sim
- X sim -hfxr 20 $(SRC)
- X
- Xtags: $(SRC)
- X ctags $(SRC)
- X
- Xinstall: $(BINDIR)/sim $(MANDIR)/sim.1
- X
- X$(BINDIR)/sim: sim
- X cp sim $(BINDIR)/sim
- X
- X$(MANDIR)/sim.1: sim.1
- X cp sim.1 $(MANDIR)/sim.1
- X
- Xclean:
- X rm -f $(OBJ) $(CLN_GRB) $(SIM_GRB) $(LANG_GRB) $(PRINT_GRB) \
- X a.out core
- X
- X#------------------------------------------------------------------------
- Xadd_run.o: buff.h text.h top.p top.h
- Xbuff.o: buff.h
- Xclang.o: idf.h stream.h
- Xcompare.o: buff.h text.h top.p top.h
- Xhash.o: buff.h text.h
- Xidf.o: idf.h
- Xpass1.o: buff.h text.h
- Xpass2.o: text.h top.p top.h debug.h
- Xpass3.o: params.h text.h top.p top.h debug.h buff.h
- Xsim.o: params.h
- Xstream.o: stream.h
- Xtop.o: text.h top.p top.h top.g
- + END-OF-FILE Makefile
- chmod 'u=rw,g=r,o=r' \M\a\k\e\f\i\l\e
- set `sum \M\a\k\e\f\i\l\e`
- sum=$1
- case $sum in
- 32570) :;;
- *) echo 'Bad sum in '\M\a\k\e\f\i\l\e >&2
- esac
- echo Extracting \s\i\m\.\1
- sed 's/^X//' > \s\i\m\.\1 << '+ END-OF-FILE '\s\i\m\.\1
- X.\" This file is part of the software similarity tester SIM.
- X.\" Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X.\"
- X.TH SIM I
- X.SH NAME
- Xsim \- find similarities in C-files
- X.SH SYNOPSIS
- X.B sim
- X[
- X.B \-[fns]
- X.BI \-r N
- X]
- Xfile ...
- X.SH DESCRIPTION
- X.I Sim
- Xreads the C-files
- X.I file ...
- Xand looks for pieces of text that are similar; two pieces of C-text
- Xare similar if they only differ in layout, comment, identifiers and
- Xthe contents of numbers, strings and characters. If any runs
- Xof sufficient length
- Xare found, they are reported on standard output; the default length
- Xminimum is 24, but can be reset by the
- X.BR \-r -parameter.
- X.PP
- XThe program can be used for finding copied pieces of code in
- Xpurportedly unrelated programs (with the
- X.BR \-s -flag),
- Xor for finding accidentally duplicated code in larger projects
- X(without the
- X.BR \-s -flag
- Xbut with the
- X.BR \-f -flag).
- X.PP
- XSince it reads the files several times, it cannot read from standard input.
- X.PP
- XThere are the following options:
- X.TP
- X.B \-f
- XRuns are restricted to pieces with balancing parentheses, to isolate
- Xpotential functions.
- X.TP
- X.B \-n
- XSimilarities found are only summarized, not displayed.
- X.TP
- X.B \-s
- XThe contents of a file are not compared to itself (\-s = not self).
- X.PP
- XThe matching process uses a hash table so that tens of thousands of
- Xlines are processed in a few minutes; if, however, there is not
- Xenough memory for the table, the matching process uses sequential
- Xsearch, which can take hours.
- X.SH AUTHOR
- XDick Grune, Vrije Universiteit, Amsterdam.
- X.SH BUGS
- XStrong periodicity in the input text (like a table of
- X.I N
- Xalmost identical lines) causes problems.
- X.I Sim
- Xtries to cope with this but cannot avoid giving appr.
- X.I log N
- Xmessages about it. The best advice is still to take the offending
- Xfiles out of the game.
- + END-OF-FILE sim.1
- chmod 'u=rw,g=r,o=r' \s\i\m\.\1
- set `sum \s\i\m\.\1`
- sum=$1
- case $sum in
- 56000) :;;
- *) echo 'Bad sum in '\s\i\m\.\1 >&2
- esac
- echo Extracting \s\t\r\e\a\m\.\h
- sed 's/^X//' > \s\t\r\e\a\m\.\h << '+ END-OF-FILE '\s\t\r\e\a\m\.\h
- X/* This file is part of the software similarity tester SIM.
- X Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X*/
- X
- X/*
- X Interface between the language-dependent lex module and
- X the stream module.
- X*/
- X
- X/* communication variables */
- Xextern int lex_no; /* pass 1: return stream of C condensed chars */
- X /* pass 2: return C-char/ASCII-char position
- X pairs at each \n */
- Xextern int lex_ch; /* condensed C-char produced by pass 1 */
- Xextern unsigned int lex_ch_cnt;
- X /* C-char position reported at each \n by pass 2 */
- Xextern long lex_ls_pos; /* lseek position reported at each \n by pass 2 */
- X
- X/* #defines for the lex module */
- X#define cput(ch) if (lex_ch_cnt++, lex_no == 1) \
- X {lex_ch = ch; return 1;} else
- X#define c_eol() if (lex_no == 2) return 1; else
- X#define count() lex_ls_pos += yyleng
- + END-OF-FILE stream.h
- chmod 'u=rw,g=r,o=r' \s\t\r\e\a\m\.\h
- set `sum \s\t\r\e\a\m\.\h`
- sum=$1
- case $sum in
- 61866) :;;
- *) echo 'Bad sum in '\s\t\r\e\a\m\.\h >&2
- esac
- echo Extracting \i\d\f\.\h
- sed 's/^X//' > \i\d\f\.\h << '+ END-OF-FILE '\i\d\f\.\h
- X/* This file is part of the software similarity tester SIM.
- X Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X*/
- X
- X/* the struct for keywords etc. */
- Xstruct idf {
- X char *id_tag; /* an interesting identifier */
- X char id_tr; /* with its one-character translation */
- X};
- X
- X#define idf2char(s,l) findidf(s, l, sizeof l / sizeof l[0])
- + END-OF-FILE idf.h
- chmod 'u=rw,g=r,o=r' \i\d\f\.\h
- set `sum \i\d\f\.\h`
- sum=$1
- case $sum in
- 29562) :;;
- *) echo 'Bad sum in '\i\d\f\.\h >&2
- esac
- echo Extracting \b\u\f\f\.\h
- sed 's/^X//' > \b\u\f\f\.\h << '+ END-OF-FILE '\b\u\f\f\.\h
- X/* This file is part of the software similarity tester SIM.
- X Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X*/
- X
- Xextern char *buff;
- Xextern unsigned int text_length();
- + END-OF-FILE buff.h
- chmod 'u=rw,g=r,o=r' \b\u\f\f\.\h
- set `sum \b\u\f\f\.\h`
- sum=$1
- case $sum in
- 20348) :;;
- *) echo 'Bad sum in '\b\u\f\f\.\h >&2
- esac
- echo Extracting \s\t\r\e\a\m\.\c
- sed 's/^X//' > \s\t\r\e\a\m\.\c << '+ END-OF-FILE '\s\t\r\e\a\m\.\c
- X/* This file is part of the software similarity tester SIM.
- X Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X*/
- X
- X#include <stdio.h>
- X#include "stream.h"
- X
- X/* imports from the lex module */
- Xextern int yylex();
- Xextern yystart();
- Xextern FILE *yyin;
- X
- Xint lex_no; /* pass 1: return stream of C condensed chars */
- X /* pass 2: return C-char/ASCII-char position
- X pairs at each \n
- X */
- X
- Xint lex_ch; /* condensed C-char produced by pass 1 */
- Xunsigned int lex_ch_cnt;/* C-char position reported at each \n by pass 2 */
- Xlong lex_ls_pos; /* lseek position reported at each \n by pass 2 */
- X
- Xint
- XOpenStream(pass, fname)
- X char *fname;
- X{
- X lex_no = pass;
- X lex_ch_cnt = 0;
- X lex_ls_pos = 0L;
- X
- X /* start the lex machine */
- X yyin = fopen(fname, "r");
- X yystart();
- X return yyin != NULL;
- X}
- X
- Xint
- XNextChar(cp) /* lex_no must be 1 */
- X char *cp;
- X{
- X if (!yylex())
- X return -1;
- X *cp = lex_ch;
- X return 0;
- X}
- X
- Xint
- XNextPair(ccp, lsp) /* lex_no must be 2 */
- X unsigned int *ccp;
- X long *lsp;
- X{
- X if (!yylex())
- X return -1;
- X *ccp = lex_ch_cnt;
- X *lsp = lex_ls_pos;
- X return 0;
- X}
- X
- XCloseStream() {
- X fclose(yyin);
- X}
- + END-OF-FILE stream.c
- chmod 'u=rw,g=r,o=r' \s\t\r\e\a\m\.\c
- set `sum \s\t\r\e\a\m\.\c`
- sum=$1
- case $sum in
- 24424) :;;
- *) echo 'Bad sum in '\s\t\r\e\a\m\.\c >&2
- esac
- echo Extracting \i\d\f\.\c
- sed 's/^X//' > \i\d\f\.\c << '+ END-OF-FILE '\i\d\f\.\c
- X/* This file is part of the software similarity tester SIM.
- X Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X*/
- X
- X#include "idf.h"
- X
- Xint
- Xfindidf(str, list, size)
- X char *str;
- X struct idf list[];
- X{
- X int first = 0;
- X int last = size - 1;
- X
- X while (first < last) {
- X int middle = (first + last) / 2;
- X
- X if (strcmp(str, list[middle].id_tag) > 0)
- X first = middle + 1;
- X else last = middle;
- X }
- X return strcmp(str, list[first].id_tag) == 0 ? list[first].id_tr : -1;
- X}
- + END-OF-FILE idf.c
- chmod 'u=rw,g=r,o=r' \i\d\f\.\c
- set `sum \i\d\f\.\c`
- sum=$1
- case $sum in
- 60560) :;;
- *) echo 'Bad sum in '\i\d\f\.\c >&2
- esac
- echo Extracting \b\u\f\f\.\c
- sed 's/^X//' > \b\u\f\f\.\c << '+ END-OF-FILE '\b\u\f\f\.\c
- X/* This file is part of the software similarity tester SIM.
- X Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X*/
- X
- X#include "buff.h"
- X
- X#define BFSIZE 10000
- X
- Xextern char *malloc(), *realloc(), *calloc();
- X
- Xchar *buff; /* to be filled by malloc */
- Xstatic unsigned int buff_size; /* size of buffer at this moment */
- Xstatic unsigned int bfree; /* next free position in array buff[] */
- X
- Xinit_buff() {
- X /* Allocate the text buffer */
- X buff = malloc(buff_size = BFSIZE);
- X if (!buff)
- X error("out of space");
- X bfree = 1; /* don't use position 0 */
- X}
- X
- Xstore(ch) {
- X if (bfree == buff_size) {
- X buff = realloc(buff, buff_size += BFSIZE);
- X if (!buff || buff_size < bfree) {
- X /* overflow */
- X error("out of space");
- X }
- X }
- X buff[bfree++] = ch;
- X}
- X
- Xunsigned int
- Xtext_length() {
- X return bfree;
- X}
- + END-OF-FILE buff.c
- chmod 'u=rw,g=r,o=r' \b\u\f\f\.\c
- set `sum \b\u\f\f\.\c`
- sum=$1
- case $sum in
- 14186) :;;
- *) echo 'Bad sum in '\b\u\f\f\.\c >&2
- esac
- echo Extracting \e\r\r\o\r\.\c
- sed 's/^X//' > \e\r\r\o\r\.\c << '+ END-OF-FILE '\e\r\r\o\r\.\c
- X/* This file is part of the software similarity tester SIM.
- X Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X*/
- X
- X#include <stdio.h>
- X
- Xerror(msg)
- X char *msg;
- X{
- X fprintf(stderr, "sim: %s\n", msg);
- X exit(1);
- X}
- + END-OF-FILE error.c
- chmod 'u=rw,g=r,o=r' \e\r\r\o\r\.\c
- set `sum \e\r\r\o\r\.\c`
- sum=$1
- case $sum in
- 10416) :;;
- *) echo 'Bad sum in '\e\r\r\o\r\.\c >&2
- esac
- echo Extracting \t\o\p\.\p
- sed 's/^X//' > \t\o\p\.\p << '+ END-OF-FILE '\t\o\p\.\p
- X/* This file is part of the software similarity tester SIM.
- X Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X*/
- X
- X/* These are the parameters with which to instantiate top.g
- X*/
- X#define TTSIZE 100 /* the TTSIZE best objects */
- X#define TTTYPE struct run /* the type of the object */
- X#define TTBETTER longer /* how to compare objects */
- + END-OF-FILE top.p
- chmod 'u=rw,g=r,o=r' \t\o\p\.\p
- set `sum \t\o\p\.\p`
- sum=$1
- case $sum in
- 20036) :;;
- *) echo 'Bad sum in '\t\o\p\.\p >&2
- esac
- echo Extracting \t\o\p\.\g
- sed 's/^X//' > \t\o\p\.\g << '+ END-OF-FILE '\t\o\p\.\g
- X/* This file is part of the software similarity tester SIM.
- X Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X*/
- X
- X/* This is a generic package for keeping a top-10 list of objects of
- X/* formal type.
- X/*
- X/* Specification in Ada-style:
- X/* generic -- 3 formals,supplied by #define
- X/* TTSIZE: int; -- the size of the top-N list
- X/* type TTTYPE is private; -- the type of the objects
- X/* with function int TTBETTER(ip, jp) TTTYPE *ip, *jp;
- X/* -- 1 if object *ip better than
- X/* -- object *jp, 0 otherwise
- X/* package TOP is -- reflected in top.h
- X/* function InitTop(); -- clears the list
- X/* function InsertTop(obj) TTTYPE *obj;
- X/* -- accepts a (pointer to) an object
- X/* -- a generator yields the objects in best-to-worst order:
- X/* type TopGen is private; -- to declare the generator
- X/* function OpenTop(tg) TopGen *tg;-- starts the generator
- X/* function TTTYPE *NextTop(tg) TopGen *tg;
- X/* -- yields next object, moves generator
- X/* NoObject: constant (TTTYPE*); -- yielded at end-of-list
- X/* function CloseTop(tg) TopGen *tg;-- stops the generator
- X/* end TOP -- */
- X/* The application of this file must be preceded by
- X/* #include "top.h", which defines the interface, and by
- X/* #include "top.p", which defines the parameters of the instantiation.
- X
- X/* package body TOP is -- */
- Xstatic TTTYPE val[TTSIZE];
- Xstatic TTTYPE *list[TTSIZE];
- Xstatic int cnt;
- X
- XInitTop() {
- X cnt = 0;
- X}
- X
- XInsertTop(obj) register TTTYPE *obj; {
- X register int i;
- X
- X if (cnt < TTSIZE) { /* there is still room */
- X list[cnt] = &val[cnt];
- X val[cnt++] = *obj;
- X }
- X else
- X if (TTBETTER(obj, list[TTSIZE-1])) {
- X /* preferable to worst in set */
- X *list[TTSIZE-1] = *obj;
- X }
- X else return; /* we're not interested */
- X
- X for (i = cnt-2; i >= 0 && TTBETTER(list[i+1], list[i]); i--) {
- X register TTTYPE *jp = list[i+1];
- X list[i+1] = list[i];
- X list[i] = jp;
- X }
- X}
- X
- XOpenTop(tg) register TopGen *tg; {
- X *tg = 0;
- X}
- X
- XTTTYPE *
- XNextTop(tg) register TopGen *tg; {
- X return *tg >= cnt ? NoObject : list[(*tg)++];
- X}
- X
- XCloseTop(tg) register TopGen *tg; {
- X *tg = TTSIZE;
- X}
- X/* end TOP -- */
- + END-OF-FILE top.g
- chmod 'u=rw,g=r,o=r' \t\o\p\.\g
- set `sum \t\o\p\.\g`
- sum=$1
- case $sum in
- 62947) :;;
- *) echo 'Bad sum in '\t\o\p\.\g >&2
- esac
- echo Extracting \t\o\p\.\h
- sed 's/^X//' > \t\o\p\.\h << '+ END-OF-FILE '\t\o\p\.\h
- X/* This file is part of the software similarity tester SIM.
- X Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X*/
- X
- X/* This is the public interface of top.g, a generic package for keeping
- X/* a top-10 list of objects of formal type.
- X/*
- X/* The application of this file must be preceded by
- X/* #include "top.p", which defines the parameters of the instantiation.
- X*/
- X
- Xextern InitTop();
- Xextern InsertTop();
- Xtypedef int TopGen;
- Xextern OpenTop();
- Xextern TTTYPE *NextTop();
- X#define NoObject ((TTTYPE*)0)
- Xextern CloseTop();
- + END-OF-FILE top.h
- chmod 'u=rw,g=r,o=r' \t\o\p\.\h
- set `sum \t\o\p\.\h`
- sum=$1
- case $sum in
- 48552) :;;
- *) echo 'Bad sum in '\t\o\p\.\h >&2
- esac
- echo Extracting \t\o\p\.\c
- sed 's/^X//' > \t\o\p\.\c << '+ END-OF-FILE '\t\o\p\.\c
- X/* This file is part of the software similarity tester SIM.
- X Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X*/
- X
- X#include "text.h"
- X#include "top.p"
- X#include "top.h"
- X
- X#ifndef lint /* lint won't take this define ?!?!?! */
- X#define longer(r0,r1) (r0->rn_quality > r1->rn_quality)
- X
- X#else
- Xstatic int
- Xlonger(r0, r1) struct run *r0, *r1; {
- X return r0->rn_quality > r1->rn_quality;
- X}
- X#endif lint
- X
- X/* Instantiate top.g */
- X#include "top.g"
- + END-OF-FILE top.c
- chmod 'u=rw,g=r,o=r' \t\o\p\.\c
- set `sum \t\o\p\.\c`
- sum=$1
- case $sum in
- 10035) :;;
- *) echo 'Bad sum in '\t\o\p\.\c >&2
- esac
- echo Extracting \p\a\r\a\m\s\.\h
- sed 's/^X//' > \p\a\r\a\m\s\.\h << '+ END-OF-FILE '\p\a\r\a\m\s\.\h
- X/* This file is part of the software similarity tester SIM.
- X Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X*/
- X
- X#define MIN_RUN 24 /* default minimum size
- X of interesting run
- X */
- X#define PAGE_WIDTH 80
- + END-OF-FILE params.h
- chmod 'u=rw,g=r,o=r' \p\a\r\a\m\s\.\h
- set `sum \p\a\r\a\m\s\.\h`
- sum=$1
- case $sum in
- 36700) :;;
- *) echo 'Bad sum in '\p\a\r\a\m\s\.\h >&2
- esac
- echo Extracting \t\e\x\t\.\h
- sed 's/^X//' > \t\e\x\t\.\h << '+ END-OF-FILE '\t\e\x\t\.\h
- X/* This file is part of the software similarity tester SIM.
- X Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X*/
- X
- Xstruct text {
- X char *tx_fname; /* the file name */
- X int tx_needed; /* set if file plays a role in final output */
- X unsigned int tx_start; /* positions in buff for the text */
- X unsigned int tx_limit;
- X};
- X
- Xstruct chunk {
- X /* a chunk of text in various representations */
- X struct text *ch_text; /* a pointer to the file text */
- X unsigned int ch_st_ch; /* first in chunk, counted in C-chars */
- X unsigned int ch_lm_ch; /* first not in chunk */
- X long ch_st_ls; /* same in lseek positions */
- X long ch_lm_ls;
- X unsigned int ch_st_nl; /* same in line numbers */
- X unsigned int ch_lm_nl;
- X};
- X
- Xstruct run { /* a 'run' of coincident chars */
- X struct chunk rn_cn0; /* chunk in left file */
- X struct chunk rn_cn1; /* chunk in right file */
- X unsigned int rn_quality;
- X};
- + END-OF-FILE text.h
- chmod 'u=rw,g=r,o=r' \t\e\x\t\.\h
- set `sum \t\e\x\t\.\h`
- sum=$1
- case $sum in
- 43449) :;;
- *) echo 'Bad sum in '\t\e\x\t\.\h >&2
- esac
- echo Extracting \d\e\b\u\g\.\h
- sed 's/^X//' > \d\e\b\u\g\.\h << '+ END-OF-FILE '\d\e\b\u\g\.\h
- X/* This file is part of the software similarity tester SIM.
- X Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X*/
- X
- X#define DEBUG 0
- + END-OF-FILE debug.h
- chmod 'u=rw,g=r,o=r' \d\e\b\u\g\.\h
- set `sum \d\e\b\u\g\.\h`
- sum=$1
- case $sum in
- 64324) :;;
- *) echo 'Bad sum in '\d\e\b\u\g\.\h >&2
- esac
- echo Extracting \s\i\m\.\c
- sed 's/^X//' > \s\i\m\.\c << '+ END-OF-FILE '\s\i\m\.\c
- X/* This file is part of the software similarity tester SIM.
- X Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X*/
- X
- X#include "params.h"
- X
- Xint min_run_size = MIN_RUN;
- X
- Xchar options[128]; /* for various, extensible flags */
- X
- Xmain(argc, argv)
- X char *argv[];
- X{
- X argv++, argc--; /* skip program name */
- X
- X while (argc > 0 && argv[0][0] == '-') {
- X char *par = &argv[0][1];
- X
- X while (*par) {
- X switch (*par) {
- X case 'r':
- X min_run_size = atoi(argv[1]);
- X argc--, argv++;
- X break;
- X default:
- X options[*par]++;
- X break;
- X }
- X par++;
- X }
- X argc--, argv++;
- X }
- X if (min_run_size == 0)
- X error("Minimum run size equals 0");
- X
- X init_buff();
- X
- X /* Read the files */
- X pass1(argv, argc);
- X
- X /* Set up the hash table */
- X make_hash();
- X
- X /* Compare various files */
- X compare();
- X
- X /* Delete hash table */
- X free_hash();
- X
- X /* Find positions of found similarities */
- X pass2();
- X
- X /* Print the similarities */
- X pass3();
- X return 0;
- X}
- + END-OF-FILE sim.c
- chmod 'u=rw,g=r,o=r' \s\i\m\.\c
- set `sum \s\i\m\.\c`
- sum=$1
- case $sum in
- 42365) :;;
- *) echo 'Bad sum in '\s\i\m\.\c >&2
- esac
- echo Extracting \p\a\s\s\1\.\c
- sed 's/^X//' > \p\a\s\s\1\.\c << '+ END-OF-FILE '\p\a\s\s\1\.\c
- X/* This file is part of the software similarity tester SIM.
- X Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X*/
- X
- X#include "buff.h"
- X#include "text.h"
- X
- Xextern char *calloc();
- X
- Xstruct text *text; /* to be filled in by calloc */
- Xint ntexts; /* number of text records */
- X
- Xpass1(argv, argc)
- X char *argv[];
- X{
- X int n;
- X
- X /* allocate the array of text descriptors */
- X ntexts = argc;
- X text = (struct text *)calloc((unsigned)ntexts, sizeof (struct text));
- X if (!text)
- X error("Too many files");
- X
- X /* read the files */
- X for (n = 0; n < ntexts; n++) {
- X char *fname = argv[n];
- X struct text *txt = &text[n];
- X char ch;
- X
- X printf("File %s: ", fname);
- X
- X txt->tx_fname = fname;
- X txt->tx_start = txt->tx_limit = text_length();
- X if (!OpenStream(1, txt->tx_fname)) {
- X printf("cannot open\n");
- X OpenStream(1, "/dev/null");
- X }
- X while (NextChar(&ch) == 0)
- X store(ch);
- X CloseStream();
- X
- X txt->tx_limit = text_length();
- X printf("%u C-units\n", txt->tx_limit - txt->tx_start);
- X }
- X printf("Total: %u C-units\n", text_length() - 1);
- X printf("\n");
- X}
- + END-OF-FILE pass1.c
- chmod 'u=rw,g=r,o=r' \p\a\s\s\1\.\c
- set `sum \p\a\s\s\1\.\c`
- sum=$1
- case $sum in
- 58561) :;;
- *) echo 'Bad sum in '\p\a\s\s\1\.\c >&2
- esac
- echo Extracting \h\a\s\h\.\c
- sed 's/^X//' > \h\a\s\h\.\c << '+ END-OF-FILE '\h\a\s\h\.\c
- X/* This file is part of the software similarity tester SIM.
- X Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X*/
- X
- X#include "buff.h"
- X#include "text.h"
- X
- Xextern char *calloc();
- X
- Xextern char options[];
- Xextern int ntexts;
- Xextern struct text *text;
- Xextern int min_run_size;
- X
- Xstatic int hash_code();
- Xstatic print_hash();
- X
- X#define N_HASH 10639 /* any suitable prime */
- X
- Xunsigned int *hash_table; /* to be filled by malloc() */
- X
- X/* to judge the quality of the hash code */
- Xstatic tally_right = 0, tally_wrong = 0;
- Xstatic tally_hash(), print_tally();
- X
- Xmake_hash() {
- X unsigned int last[N_HASH];
- X /* last[i] is the index of the latest char with hash_code i,
- X or 0 if there is none.
- X */
- X int n;
- X
- X for (n = 0; n < N_HASH; n++)
- X last[n] = 0;
- X
- X hash_table = (unsigned int *)
- X calloc(text_length(), sizeof (unsigned int));
- X if (options['x'])
- X hash_table = 0;
- X if (!hash_table) {
- X printf(">>> Not enough memory for the hash table, ");
- X printf("this is going to take time!\n\n");
- X return;
- X }
- X
- X for (n = 0; n < ntexts; n++) {
- X struct text *txt = &text[n];
- X unsigned int j;
- X
- X for (
- X j = txt->tx_start;
- X j < txt->tx_limit - min_run_size + 1;
- X j++
- X ) {
- X int h = hash_code(&buff[j]);
- X
- X if (last[h]) {
- X hash_table[last[h]] = j;
- X if (options['h'])
- X tally_hash(last[h], j);
- X }
- X last[h] = j;
- X }
- X }
- X if (options['h'])
- X print_tally();
- X if (options['H'])
- X print_hash();
- X}
- X
- Xstatic int
- Xhash_code(p)
- X char *p;
- X{
- X /* hash_code(p) returns the hash code of the min_run_size first
- X characters starting at p; caller guarantees that there
- X are at least min_run_size chars.
- X */
- X int h = 0;
- X int i;
- X
- X for (i = 0; i < min_run_size; i++)
- X h = ((h << 1) + *p++) % N_HASH;
- X return h;
- X}
- X
- Xstatic
- Xprint_hash()
- X{
- X /* will not be called if hash_table == 0 */
- X unsigned int i;
- X
- X for (i = 1; i < text_length(); i++) {
- X printf("%d: %c: ", i, buff[i]);
- X printf("%u\n", hash_table[i]);
- X }
- X}
- X
- Xstatic
- Xtally_hash(i0, i1)
- X unsigned int i0, i1;
- X{
- X int i;
- X
- X for (i = 0; i < min_run_size; i++) {
- X if (buff[i0++] != buff[i1++]) {
- X tally_wrong++;
- X return;
- X }
- X }
- X tally_right++;
- X}
- X
- Xstatic
- Xprint_tally()
- X{
- X printf("Tally_right = %d, tally_wrong = %d, ",
- X tally_right, tally_wrong);
- X printf("hash code efficiency = %d%%\n",
- X 100 * tally_right / (tally_right + tally_wrong));
- X}
- X
- Xfree_hash() {
- X if (hash_table)
- X free((char *)hash_table);
- X}
- + END-OF-FILE hash.c
- chmod 'u=rw,g=r,o=r' \h\a\s\h\.\c
- set `sum \h\a\s\h\.\c`
- sum=$1
- case $sum in
- 22444) :;;
- *) echo 'Bad sum in '\h\a\s\h\.\c >&2
- esac
- echo Extracting \c\o\m\p\a\r\e\.\c
- sed 's/^X//' > \c\o\m\p\a\r\e\.\c << '+ END-OF-FILE '\c\o\m\p\a\r\e\.\c
- X/* This file is part of the software similarity tester SIM.
- X Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X*/
- X
- X#include "buff.h"
- X#include "text.h"
- X#include "top.p"
- X#include "top.h"
- X
- X/* from the Language Department: */
- Xextern int MayBeStartOfRun();
- Xextern unsigned int CheckRun();
- X
- Xextern char options[];
- Xextern int ntexts;
- Xextern struct text *text;
- Xextern int min_run_size;
- Xextern unsigned int *hash_table;
- Xextern add_run();
- X
- Xstatic struct text *txt_at();
- Xstatic unsigned int lcs();
- X
- Xcompare()
- X{
- X int n;
- X
- X InitTop();
- X for (n = 0; n < ntexts; n++) {
- X struct text *txt0 = &text[n];
- X unsigned int i0 = txt0->tx_start;
- X
- X while (i0 < txt0->tx_limit - min_run_size + 1) {
- X i0 += lcs(txt0, i0);
- X }
- X }
- X}
- X
- Xstatic unsigned int
- Xlcs(txt0, i0)
- X struct text *txt0;
- X unsigned int i0;
- X{
- X /* find the longest common substring in:
- X txt0, starting precisely at i0 and
- X the rest of the text
- X */
- X struct text *txt1 = txt0;
- X unsigned int i1 = i0;
- X struct text *txt_best;
- X unsigned int i_best;
- X unsigned int size_best = 0;
- X
- X if (!MayBeStartOfRun(buff[i0])) {
- X return 1;
- X }
- X
- X while(
- X i1 = hash_table ? hash_table[i1] : i1 + 1,
- X txt1 = txt_at(txt1, i1)
- X ) {
- X
- X if ( /* we don't want to compare a file to itself */
- X options['s'] && i1 < txt0->tx_limit
- X ) {
- X /* skip this possibility */
- X }
- X else
- X if ( /* we are looking at the middle of a run */
- X i0 != txt0->tx_start && i1 != txt1->tx_start &&
- X buff[i0-1] == buff[i1-1]
- X ) {
- X /* skip this possibility */
- X }
- X else {
- X /* see how far we can get */
- X unsigned int j0 = i0, j1 = i1;
- X unsigned int size = 0;
- X unsigned int limit0 = txt0->tx_limit;
- X unsigned int limit1 = txt1->tx_limit;
- X
- X while ( size < j1 - j0 &&
- X j0 < limit0 && j1 < limit1 &&
- X buff[j0] == buff[j1]
- X ) {
- X j0++, j1++, size++;
- X }
- X
- X if (size >= min_run_size) {
- X /* offer the run to the
- X Language Department
- X */
- X size = CheckRun(&buff[i0], size);
- X }
- X
- X if ( /* we still have something better */
- X size >= min_run_size && size > size_best
- X ) {
- X /* record it */
- X txt_best = txt1;
- X i_best = i1;
- X size_best = size;
- X }
- X }
- X }
- X if (size_best) {
- X add_run(txt0, i0, txt_best, i_best, size_best);
- X return size_best;
- X }
- X else
- X return 1;
- X}
- X
- Xstatic struct text *
- Xtxt_at(txt, i)
- X struct text *txt;
- X unsigned int i;
- X{
- X if (i == 0 || i >= text_length())
- X return 0;
- X while (i >= txt->tx_limit)
- X txt++;
- X return txt;
- X}
- + END-OF-FILE compare.c
- chmod 'u=rw,g=r,o=r' \c\o\m\p\a\r\e\.\c
- set `sum \c\o\m\p\a\r\e\.\c`
- sum=$1
- case $sum in
- 22558) :;;
- *) echo 'Bad sum in '\c\o\m\p\a\r\e\.\c >&2
- esac
- echo Extracting \a\d\d\_\r\u\n\.\c
- sed 's/^X//' > \a\d\d\_\r\u\n\.\c << '+ END-OF-FILE '\a\d\d\_\r\u\n\.\c
- X/* This file is part of the software similarity tester SIM.
- X Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X*/
- X
- X#include "buff.h"
- X#include "text.h"
- X#include "top.p"
- X#include "top.h"
- X
- Xstatic set_chunk();
- X
- Xadd_run(txt0, i0, txt1, i1, size)
- X struct text *txt0, *txt1;
- X unsigned int i0, i1;
- X unsigned int size;
- X{
- X /* Adds the run of given size to our collection.
- X */
- X struct run r;
- X
- X set_chunk(&r.rn_cn0, txt0, i0 - txt0->tx_start, size);
- X set_chunk(&r.rn_cn1, txt1, i1 - txt1->tx_start, size);
- X r.rn_quality = size;
- X
- X InsertTop(&r);
- X}
- X
- Xstatic
- Xset_chunk(cnk, txt, index, size)
- X struct chunk *cnk;
- X struct text *txt;
- X unsigned int index;
- X unsigned int size;
- X{
- X /* Fill the chunk *cnk with info about the piece of text
- X in txt starting at index extending over size characters.
- X */
- X txt->tx_needed = 1;
- X cnk->ch_text = txt;
- X cnk->ch_st_ch = index;
- X cnk->ch_lm_ch = index + size;
- X cnk->ch_st_ls = cnk->ch_lm_ls = 0;
- X cnk->ch_st_nl = cnk->ch_lm_nl = 1;
- X}
- + END-OF-FILE add_run.c
- chmod 'u=rw,g=r,o=r' \a\d\d\_\r\u\n\.\c
- set `sum \a\d\d\_\r\u\n\.\c`
- sum=$1
- case $sum in
- 53616) :;;
- *) echo 'Bad sum in '\a\d\d\_\r\u\n\.\c >&2
- esac
- echo Extracting \p\a\s\s\2\.\c
- sed 's/^X//' > \p\a\s\s\2\.\c << '+ END-OF-FILE '\p\a\s\s\2\.\c
- X/* This file is part of the software similarity tester SIM.
- X Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X*/
- X
- X#include "text.h"
- X#include "top.p"
- X#include "top.h"
- X#include "debug.h"
- X
- Xextern int ntexts;
- Xextern struct text *text;
- X
- Xstatic upd_top(), upd_chunk();
- X
- Xpass2() {
- X int n;
- X
- X for (n = 0; n < ntexts; n++) {
- X struct text *txt = &text[n];
- X unsigned int ch_cnt;
- X long ls_pos;
- X unsigned int nl_cnt = 1;
- X
- X if (!txt->tx_needed) /* an optimization */
- X continue;
- X
- X if (!OpenStream(2, txt->tx_fname)) {
- X printf("*** File %s disappeared\n", txt->tx_fname);
- X OpenStream(2, "/dev/null");
- X }
- X
- X while (NextPair(&ch_cnt, &ls_pos) == 0) {
- X /* fill in the lseek and line positions
- X in the collected runs
- X */
- X nl_cnt++;
- X#if DEBUG == 1
- X printf("pass2 on %s: ch_cnt = %u, ls_pos = %ld\n",
- X txt->tx_fname, ch_cnt, ls_pos);
- X#endif DEBUG == 1
- X upd_top(txt, ch_cnt, ls_pos, nl_cnt);
- X }
- X CloseStream();
- X }
- X}
- X
- Xstatic
- Xupd_top(txt, ch_cnt, ls_pos, nl_cnt)
- X struct text *txt;
- X unsigned int ch_cnt;
- X long ls_pos;
- X unsigned int nl_cnt;
- X{
- X TopGen tp;
- X struct run *run;
- X
- X OpenTop(&tp);
- X while ((run = NextTop(&tp)), run != NoObject) {
- X struct chunk *cnk0 = &run->rn_cn0;
- X struct chunk *cnk1 = &run->rn_cn1;
- X
- X if (cnk0->ch_text == txt)
- X upd_chunk(cnk0, ch_cnt, ls_pos, nl_cnt);
- X if (cnk1->ch_text == txt)
- X upd_chunk(cnk1, ch_cnt, ls_pos, nl_cnt);
- X }
- X CloseTop(&tp);
- X}
- X
- Xstatic
- Xupd_chunk(cnk, ch_cnt, ls_pos, nl_cnt)
- X struct chunk *cnk;
- X unsigned int ch_cnt;
- X long ls_pos;
- X unsigned int nl_cnt;
- X{
- X if (ch_cnt <= cnk->ch_st_ch) {
- X cnk->ch_st_ls = ls_pos;
- X cnk->ch_st_nl = nl_cnt;
- X }
- X if (cnk->ch_lm_ls == 0 && cnk->ch_lm_ch <= ch_cnt) {
- X cnk->ch_lm_ls = ls_pos;
- X cnk->ch_lm_nl = nl_cnt;
- X }
- X}
- + END-OF-FILE pass2.c
- chmod 'u=rw,g=r,o=r' \p\a\s\s\2\.\c
- set `sum \p\a\s\s\2\.\c`
- sum=$1
- case $sum in
- 15182) :;;
- *) echo 'Bad sum in '\p\a\s\s\2\.\c >&2
- esac
- echo Extracting \p\a\s\s\3\.\c
- sed 's/^X//' > \p\a\s\s\3\.\c << '+ END-OF-FILE '\p\a\s\s\3\.\c
- X/* This file is part of the software similarity tester SIM.
- X Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X*/
- X
- X#include <stdio.h>
- X#include "params.h"
- X#include "text.h"
- X#include "top.p"
- X#include "top.h"
- X#include "debug.h"
- X
- Xextern char options[];
- X
- Xstatic FILE *chunk_open();
- Xstatic unsigned int fill_line();
- Xstatic show_chunk(), show_line(), clear_line(), print_run();
- X
- X#define MAXLINE (PAGE_WIDTH/2-2)
- X
- Xpass3() {
- X TopGen tp;
- X struct run *run;
- X
- X OpenTop(&tp);
- X while ((run = NextTop(&tp)), run != NoObject) {
- X print_run(run);
- X show_chunk(run);
- X printf("\n");
- X }
- X CloseTop(&tp);
- X}
- X
- Xstatic
- Xprint_run(run)
- X struct run *run;
- X{
- X#if DEBUG == 1
- X#include "buff.h"
- X unsigned int i;
- X struct chunk *cnk0 = &run->rn_cn0;
- X struct chunk *cnk1 = &run->rn_cn1;
- X
- X printf("File %s vs. file %s:\n",
- X cnk0->ch_text->tx_fname,
- X cnk1->ch_text->tx_fname
- X );
- X printf("from C-char %d,%d to %d,%d:",
- X cnk0->ch_st_ch, cnk1->ch_st_ch,
- X cnk0->ch_lm_ch, cnk1->ch_lm_ch
- X );
- X printf(" from ASCII-char %d,%d to %d,%d:",
- X cnk0->ch_st_ls, cnk1->ch_st_ls,
- X cnk0->ch_lm_ls, cnk1->ch_lm_ls
- X );
- X printf(" from lines %d,%d to %d,%d:",
- X cnk0->ch_st_nl, cnk1->ch_st_nl,
- X cnk0->ch_lm_nl - 1, cnk1->ch_lm_nl - 1
- X );
- X printf(" %d C-chars\n", run->rn_quality);
- X
- X /* show C-chars, with a one-char margin */
- X for ( i = cnk0->ch_st_ch - 1;
- X i < cnk0->ch_lm_ch + 1;
- X i++
- X ) {
- X putchar(buff[cnk0->ch_text->tx_start + i]);
- X }
- X printf("\n");
- X for ( i = cnk1->ch_st_ch - 1;
- X i < cnk1->ch_lm_ch + 1;
- X i++
- X ) {
- X putchar(buff[cnk1->ch_text->tx_start + i]);
- X }
- X printf("\n");
- X#endif DEBUG == 1
- X
- X#ifdef lint
- X run = run;
- X#endif lint
- X}
- X
- X
- Xstatic
- Xshow_chunk(run)
- X struct run *run;
- X{
- X /* The animals came in two by two ... */
- X struct chunk *cnk0 = &run->rn_cn0;
- X struct chunk *cnk1 = &run->rn_cn1;
- X unsigned int nl_cnt0 = cnk0->ch_lm_nl - cnk0->ch_st_nl;
- X unsigned int nl_cnt1 = cnk1->ch_lm_nl - cnk1->ch_st_nl;
- X FILE *f0;
- X FILE *f1;
- X char line0[MAXLINE + 1];
- X char line1[MAXLINE + 1];
- X extern char *sprintf();
- X
- X sprintf(line0, "%s: line %d-%d",
- X cnk0->ch_text->tx_fname,
- X cnk0->ch_st_nl, cnk0->ch_lm_nl - 1, run->rn_quality);
- X sprintf(line1, "%s: line %d-%d",
- X cnk1->ch_text->tx_fname,
- X cnk1->ch_st_nl, cnk1->ch_lm_nl - 1, run->rn_quality);
- X show_line(line0, line1);
- X if (options['n'])
- X return; /* ... had enough so soon ... */
- X
- X f0 = chunk_open(cnk0);
- X f1 = chunk_open(cnk1);
- X
- X /* fill lines and print them */
- X while (nl_cnt0 != 0 || nl_cnt1 != 0) {
- X if (nl_cnt0) {
- X fill_line(f0, line0);
- X nl_cnt0--;
- X }
- X else clear_line(line0);
- X if (nl_cnt1) {
- X fill_line(f1, line1);
- X nl_cnt1--;
- X }
- X else clear_line(line1);
- X show_line(line0, line1);
- X }
- X
- X fclose(f0);
- X fclose(f1);
- X}
- X
- Xstatic FILE *
- Xchunk_open(cnk)
- X struct chunk *cnk;
- X{
- X /* opens the file in which the chunk resides and positions
- X the file at the beginning of the chunk
- X */
- X char *fname = cnk->ch_text->tx_fname;
- X FILE *f = fopen(fname, "r");
- X
- X if (f == NULL) {
- X printf("*** File %s disappeared\n", fname);
- X f = fopen("/dev/null", "r");
- X }
- X fseek(f, cnk->ch_st_ls, 0);
- X return f;
- X}
- X
- Xstatic unsigned int
- Xfill_line(f, ln)
- X FILE *f;
- X char ln[];
- X{
- X /* Reads one line from f and puts it in condensed form in ln.
- X */
- X int ch;
- X int indent = 0, lpos = 0;
- X
- X /* condense and skip initial blank */
- X while ((ch = getc(f)), ch == ' ' || ch == '\t') {
- X if (ch == '\t')
- X indent = 8;
- X else
- X indent++;
- X if (indent == 8) {
- X /* every eight blanks give one blank */
- X if (lpos < MAXLINE)
- X ln[lpos++] = ' ';
- X indent = 0;
- X }
- X }
- X
- X /* store the rest */
- X while (ch >= 0 && ch != '\n') {
- X if (ch == '\t') /* replace tabs by blanks */
- X ch = ' ';
- X if (lpos < MAXLINE)
- X ln[lpos++] = ch;
- X ch = getc(f);
- X }
- X ln[lpos] = '\0'; /* always room for this one */
- X}
- X
- Xstatic
- Xclear_line(ln)
- X char ln[];
- X{
- X /* a simple null byte will suffice */
- X ln[0] = '\0';
- X}
- X
- Xstatic
- Xshow_line(ln0, ln1)
- X char ln0[], ln1[];
- X{
- X int i;
- X
- X for (i = 0; i < MAXLINE && ln0[i] != '\0'; i++)
- X putchar(ln0[i]);
- X for (; i < MAXLINE; i++)
- X putchar(' ');
- X printf(" |");
- X
- X for (i = 0; i < MAXLINE && ln1[i] != '\0'; i++)
- X putchar(ln1[i]);
- X printf("\n");
- X}
- + END-OF-FILE pass3.c
- chmod 'u=rw,g=r,o=r' \p\a\s\s\3\.\c
- set `sum \p\a\s\s\3\.\c`
- sum=$1
- case $sum in
- 35264) :;;
- *) echo 'Bad sum in '\p\a\s\s\3\.\c >&2
- esac
- echo Extracting \m\a\i\n\.\c
- sed 's/^X//' > \m\a\i\n\.\c << '+ END-OF-FILE '\m\a\i\n\.\c
- X/* This file is part of the software similarity tester SIM.
- X Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X*/
- X
- X/*
- X This is a service program for the similarity tester.
- X A call of 'cstream -1 inp.c' yields the tokens of inp.c as
- X single characters, as used by pass1 of sim.
- X A call of 'cstream -2 inp.c' yields a list of <token_number,
- X character_number> pairs, one for each line. This is used by
- X pass3.
- X*/
- X
- X#include <stdio.h>
- X
- Xchar options[128];
- X
- Xmain(argc, argv)
- X char *argv[];
- X{
- X if (argc != 3) {
- X fprintf(stderr, "Call is: %s -[12] inp.c\n", argv[0]);
- X return 1;
- X }
- X
- X if (!OpenStream(argv[1][1] - '0', argv[2])) {
- X fprintf(stderr, "%s: cannot open\n", argv[2]);
- X return 1;
- X }
- X
- X if (argv[1][1] == '1') {
- X char ch;
- X
- X while (NextChar(&ch) == 0)
- X putchar(ch);
- X }
- X else
- X if (argv[1][1] == '2') {
- X unsigned int ch_cnt;
- X long ls_pos;
- X
- X while (NextPair(&ch_cnt, &ls_pos) == 0)
- X printf("%ld,%ld\n", ch_cnt, ls_pos);
- X }
- X return 0;
- X}
- + END-OF-FILE main.c
- chmod 'u=rw,g=r,o=r' \m\a\i\n\.\c
- set `sum \m\a\i\n\.\c`
- sum=$1
- case $sum in
- 07408) :;;
- *) echo 'Bad sum in '\m\a\i\n\.\c >&2
- esac
- echo Extracting \c\l\a\n\g\.\l
- sed 's/^X//' > \c\l\a\n\g\.\l << '+ END-OF-FILE '\c\l\a\n\g\.\l
- X%{
- X/* This file is part of the software similarity tester SIM.
- X Written by Dick Grune, Vrije Universiteit, Amsterdam.
- X*/
- X
- X/*
- X C language front end for the similarity tester.
- X*/
- X
- X/* Language-dependent Code */
- X#include "idf.h"
- X
- Xstatic struct idf ppcmd[] = {
- X "define", 'D',
- X "else", 'E',
- X "endif", 'Z',
- X "if", 'F',
- X "ifdef", 'Y',
- X "ifndef", 'N',
- X "include", 'I',
- X "line", 'L',
- X "undef", 'U'
- X};
- X
- Xstatic struct idf reserved[] = {
- X "auto", 'a',
- X "break", 'b',
- X "case", 'k',
- X "char", 'c',
- X "continue", 'z',
- X "default", '_',
- X "do", 'd',
- X "double", 'm',
- X "else", 'e',
- X "enum", 'n',
- X "extern", 'q',
- X "float", 'y',
- X "for", 'f',
- X "goto", 'g',
- X "if", 'i',
- X "int", 'j',
- X "long", 'l',
- X "register", '\0', /* ignore */
- X "return", 'r',
- X "short", 'h',
- X "sizeof", 'o',
- X "static", 'p',
- X "struct", 's',
- X "switch", 'x',
- X "typedef", 't',
- X "union", 'u',
- X "unsigned", 'v',
- X "void", '\0', /* ignore */
- X "while", 'w'
- X};
- X
- Xstatic int
- Xis_trailer(ch) {
- X return ch == ')' || ch == '}' || ch == ';';
- X}
- X
- Xint
- XMayBeStartOfRun(ch) {
- X return !is_trailer(ch);
- X}
- X
- Xunsigned int
- XCheckRun(str, size) char *str; unsigned int size; {
- X /* Checks the run starting at str with length size for
- X acceptability in the language. Cuts from the end if
- X necessary and returns the accepted length (which may
- X be zero).
- X */
- X extern char options[];
- X
- X if (options['f']) { /* function-like forms only */
- X unsigned int pos;
- X unsigned int lb_pos = 0;/* latest balancing position */
- X int braces = 0;
- X int parens = 0;
- X int brackets = 0;
- X
- X for (pos = 0; pos < size; pos++) {
- X switch (str[pos]) {
- X case '{': braces++; break;
- X case '}': braces--; break;
- X case '(': parens++; break;
- X case ')': parens--; break;
- X case '[': brackets++; break;
- X case ']': brackets--; break;
- X }
- X if ( /* this was one closer too many */
- X braces < 0 || parens < 0 || brackets < 0
- X ) {
- X break;
- X }
- X if ( /* it happens to balance here */
- X braces == 0 && parens == 0 && brackets == 0
- X ) {
- X lb_pos = pos + 1;
- X }
- X }
- X size = lb_pos; /* cut to size */
- X }
- X else {
- X while ( /* there is trailing garbage */
- X size != 0 &&
- X (str[size - 1] == '@' || is_trailer(str[size - 1]))
- X ) {
- X /* remove it */
- X size--;
- X }
- X }
- X return size;
- X}
- X
- X/* Language-INdependent Code */
- X#include "stream.h"
- X
- Xyystart() {
- X BEGIN INITIAL;
- X}
- X
- Xstatic int
- Xyywrap() {
- X return 1;
- X}
- X
- X%}
- X
- X%Start Comment
- X
- XAnyQuoted (\\.)
- XStrChar ([^"\n\\]|{AnyQuoted})
- XChrChar ([^'\\]|{AnyQuoted})
- XComChar ([^*\n]|(\*[^/]))
- XIdf ([A-Za-z][A-Za-z0-9_]*)
- X
- X%%
- X
- X\"{StrChar}*\" { /* strings */
- X cput('"');
- X count();
- X }
- X
- X\'{ChrChar}\' { /* characters */
- X cput('\'');
- X count();
- X }
- X
- X"/*" { /* We cannot have one pattern for a comment
- X (although one can be written), since the matched
- X string would overflow lex-internal buffers like
- X yysbuf and yytext. So we have to break up the string
- X into lines and keep track of where we are in a start
- X condition <Comment>.
- X */
- X BEGIN Comment;
- X count();
- X }
- X
- X<Comment>{ComChar}* { /* comment up to \n or end-of-comment */
- X count();
- X }
- X
- X<Comment>"*/" { /* end-of-comment */
- X BEGIN INITIAL;
- X count();
- X }
- X
- X#[ \t]*include.* { /* skip #include line */
- X count();
- X }
- X
- X#[ \t]*{Idf} { /* a preprocessor line */
- X char *n = yytext+1;
- X int ch;
- X
- X /* skip layout in front of preprocessor identifier */
- X while (*n == ' ' || *n == '\t')
- X n++;
- X ch = idf2char(n, ppcmd);
- X if (ch < 0) {
- X cput('#');
- X }
- X else
- X cput(ch);
- X count();
- X }
- X
- X{Idf} {
- X int ch = idf2char(yytext, reserved);
- X
- X if (ch < 0)
- X cput('@');
- X else
- X if (ch > 0)
- X cput(ch);
- X count();
- X }
- X
- X[ \t] { /* layout */
- X count();
- X }
- X
- X\n { /* count newlines */
- X count();
- X c_eol();
- X }
- X
- X. { /* copy other text */
- X cput(yytext[0]);
- X count();
- X }
- X
- X%%
- + END-OF-FILE clang.l
- chmod 'u=rw,g=r,o=r' \c\l\a\n\g\.\l
- set `sum \c\l\a\n\g\.\l`
- sum=$1
- case $sum in
- 21016) :;;
- *) echo 'Bad sum in '\c\l\a\n\g\.\l >&2
- esac
- exit 0
-
-
-