home *** CD-ROM | disk | FTP | other *** search
- /* */
- /*B-SORT a sorting program */
- /* */
- /* This is a self-contained file except for the standard included */
- /* definitions and is part of the larger development of a message- */
- /* retrieval system. Since the algorithms involved are well known */
- /* it is felt that putting this program in public domain would serve */
- /* a useful purpose, namely provide some examples of some programming*/
- /* techniques using the 'C' language. The compiler used is the BDS */
- /* vs 1.46 and use is made of at least one special feature of this */
- /* compiler(see 'printf' function). */
- /* */
- /* Nominally this is a sort program and can certainly be used for */
- /* that purpose. However, since the main benefit of using a B-tree is*/
- /* when using disk-based structures, it is quite possible that this */
- /* will not be your hot sort utility. It is relatively fast though */
- /* and I would be interested in any comparisons anyone makes. Later */
- /* effort is planned to build a virtual memory support package which */
- /* will then allow the 'nodes' of the b-tree to be stored as disk */
- /* sectors and at that point this program will allow sorting of */
- /* arbitrarily large text files. Again, that is still not the main */
- /* objective for it, information retrieval is. */
- /* */
- /* I will not attempt to give an introduction to B-trees here. */
- /* My primary reference and, in fact, the source of this program is */
- /* N. Wirth's wonderful book "Algorithms+Data Structures=Programs". */
- /* I passed this book up several times in the past, but when I was */
- /* looking for a treatment of this subject, I found that it had a */
- /* nice balance between explanation and example(the best way I learn)*/
- /* and since 'C' and PASCAL are 'kissin cousins', it was 'straight */
- /* forward' to do the transcribing. So go to and read section */
- /* 4.51 if you want to understand what is going on here. Also you can*/
- /* get a little contrast of the styles which result from the different*/
- /* features of the two languages(especially w re pointer variables). */
- /* */
- /* Things to note in this program which you may not already be */
- /* familiar with are the following: */
- /* */
- /* 1) A nested data structure(PAGE) is used. This is possible */
- /* in PASCAL and PL/I too and makes life a lot more pleasant. */
- /* Try doing the same thing in BASIC or FORTRAN, what a */
- /* mess! In a sense it would be easier in assembler. */
- /* */
- /* 2) Note that the tree structure used is a balanced one */
- /* so that no single branch gets long at the expense of */
- /* others. To see the depth level of the tree, turn on */
- /* the SPRINT parameter and note the first column of the */
- /* output on any sort. There is a trade-off here, though */
- /* in that it takes longer to build the tree than if it */
- /* were just a binary tree(not an AVL though). Thats why */
- /* b-trees are best for retrieval rather than simple */
- /* sorts. */
- /* */
- /* 3) The logic uses a rather interesting recursive structure */
- /* in which 'emerging' items are handed back through higher */
- /* levels until perhaps to the root of the tree. See the */
- /* variable called 'u' in function 'search'. */
- /* */
- /* 4) The parameter KEYPTR allows the option of including the */
- /* string storage area in the nodes themselves as opposed */
- /* to allocated areas. The latter is faster since no move- */
- /* ment of strings is necessary after they have been given */
- /* initial allocations. However, for disk-based use the */
- /* strings(keys) would need to be in the nodes. To see the */
- /* difference in performance(big!) just undefine(comment */
- /* out) this parameter. */
- /* */
- /* 5) For further perfomance comparison play with the FAST */
- /* parameter. If undefined, the compiler will build a */
- /* version of B-sort with a high-level version of two */
- /* block move operations. These same functions are imple- */
- /* mented in assembler code in the file BDSNEW.CSM. By */
- /* linking with that special library, you can get a version */
- /* of this program which runs quite a bit faster. Let me */
- /* know if you get any amazing results. */
- /* */
- /* 6) One peculiarity of the 'C' language came up in an earlier */
- /* version of B-sort, it would not sort itself! Clue, this */
- /* had to do with the way that 'printf' works. See if you */
- /* can figure out how this happened and how it was fixed. */
- /* */
- /* I hope this program is useful for some in learning more about the */
- /* 'C' language and an interesting algorithm(the key to better software). */
- /* If you make any improvements give me a copy. If you mutilate it, keep */
- /* it to yourself. */
- /* Jack Riley (303) 499-9169 RCPM, Boulder, CO. */
-
- /* 1983/10/14 09:45 */
- /* Adapted for IBM PC and Aztec C II Compiler by */
- /* William Hutchison, President */
- /* Algorithmic Technology, Inc. */
- /* P.O. Box 278 */
- /* Exton, PA 19341-0278 */
- /* CompuServe [70665,1307] */
- /* */
- /* Send an SASE if you would like a copy of our software catalog */
- /* */
-
- #include <algodef.h>
- #ifdef MS_DOS
- #define FCREAT(x,y) fopen(x,y)
- VOID
- pause()
- {
- int x;
-
- x= getchar();
- } /* end pause */
- #endif
-
- #define N 2 /* half-size of b-tree node */
-
- /* options for customizing the program */
-
- #undef FAST /* uses external 'move' functions for assemlber vs */
- #undef SPRINT /* provides statistics on keys in output */
- #define KEYPTR /* uses pointer references to strings instead of */
- /* arrays in the b-tree nodes */
- #undef DIAGNOSE /* turns on voluminous trace information on actions */
- /* taken by program logic.. perhaps instructive */
- /* special assigned values */
-
- #define KEYSIZE 80
- #define MAX_LEN 20
- #define MAXPRINT 3000
-
- /* dependent parameters */
-
- #define NN N+N
- #define NM1 N-1
- #define NM2 N-2
- #define NP1 N+1
-
- /* structure definitions */
-
- #define ITEM struct sitem
- #define PAGE struct spage
-
- ITEM {
- #ifndef KEYPTR
- char KEY[KEYSIZE];
- #endif
- #ifdef KEYPTR
- char *KEY;
- #endif
- PAGE *P;
- unsigned COUNT;
- }
- oneitem;
-
- PAGE {
- unsigned M;
- PAGE *P0;
- ITEM E[NN];
- }
- onepage;
-
- /* external variables */
-
- PAGE *root,*q;
- ITEM g;
-
- FILE *infile, *outfile;
- char infilnam[MAX_LEN], instrg[MAXLINE], o_flg;
-
- int sizitem,sizpage, maxcount, nkeys, tokcount;
-
- /* beginning of programs */
-
- use_err() /* Usage message: */
- {
- fprintf(STDERR,"\nERROR: Invalid parameter specification\n\n");
- fprintf(STDERR,"Usage: b-sort <filename> <flag(s)>\n\n");
- fprintf(STDERR," -o <filename> = Write output to named file\n");
- exit(0);
- }
-
- main(argc,argv)
- int argc;
- char **argv;
- {
-
- char *arg;
-
- o_flg=NO; /* output file flag */
-
- if (argc < 2) use_err();
-
- strcpy(infilnam,*++argv);
-
- if ((infile= fopen(infilnam, READ)) == NULL){
- fprintf(STDERR,"\nERROR: Unable to open input file: %s\n",infilnam);
- exit(1);
- }
-
- if (--argc > 0)
- if (*(arg=*++argv) == '-')
- if (tolower(*++arg) == 'o'){
- o_flg++;
- if (--argc == 0){
- fprintf(STDERR,"\nneed output file name");
- use_err();
- }
- if ((outfile= FCREAT(*++argv, WRITE)) == NULL){
- fprintf(STDERR,"ERROR: Unable to create output file - %s\n",
- *argv);
- exit(0);
- }
- }
-
- root=NULL;
- sizitem=sizeof(oneitem);
- sizpage=sizeof(onepage);
- tokcount=nkeys=0;
-
- #ifdef DIAGNOSE
- fprintf(STDERR,"\n&root=%x,g=%x,sizi=%d,sizp=%d",&root,g,sizitem,sizpage);
- #endif
-
- while (fgets(instrg, MAXLINE, infile)){
-
- if (trim(instrg) <= 0 ) continue;
-
- instrg[KEYSIZE-1]='\0';
-
- #ifdef DIAGNOSE
- fprintf(STDERR,"\n\nsearch key= %s",instrg);
- #endif
- if (search(instrg,root,&g) ){
- q=root;
- if ((root=alloc(sizpage)) == NULL){
- fprintf(STDERR,"\nERROR unable to allocate page");
- exit(2);
- }
- #ifdef DIAGNOSE
- fprintf(STDERR," root=%x, q=%x",root,q);
- #endif
-
- root->M=1;
- root->P0=q;
- moveb((char *)&root->E[0], (char *)&g, sizitem);
-
- }
- }
-
- #ifdef DIAGNOSE
- fprintf(STDERR,"\nEnd of input\n");
-
- fprintf(STDERR,"\nnumber of unique keys=%d, total key count=%d\n",
- nkeys,tokcount);
- if (!o_flg) pause();
- #endif
-
- maxcount=MAXPRINT;
- printtree(root,1);
-
- fprintf(STDERR,"\n");
- if (o_flg){
-
- fflush(outfile);
- fclose(outfile);
- }
- }
- char search(x,a,v)
- char *x;
- PAGE *a;
- ITEM *v;
- {
-
- int i,k,l,r,cmp;
- PAGE *q,*b;
- ITEM u;
- char *t;
-
- /* Search for key x on page a */
-
- if (a==NULL) /* ITEM with key x is not in tree */ {
-
- ++tokcount;
- ++nkeys;
- defkey(&v->KEY,x);
-
- #ifdef DIAGNOSE
- fprintf(STDERR,"\n a == null v(=%x)->KEY=%s",v,v->KEY);
- #endif
-
- v->COUNT=1;
- v->P=NULL;
- return (YES) ; /* YES means not found */
- }
- else {
- l=0;
- r=a->M-1; /* binary array search */
- do {
- k=(l+r)/2;
- cmp= strcmp(x,t=(a->E[k].KEY));
-
- #ifdef DIAGNOSE
- fprintf(STDERR,"\ncmp=%d,r=%d,l=%d,a(=%x)->P0=%x/E[k=%d].P=%x/E[k].KEY=%x=%s",
- cmp,r,l,a,a->P0,k,a->E[k].P,t,t );
- #endif
-
- if (cmp <= 0) r=k-1;
- if (cmp >= 0) l=k+1;
-
- }
- while (r >= l );
-
- if (cmp == 0 ) /* found it, bump counter */ {
- ++tokcount;
- ++a->E[k].COUNT;
- return(NO);
- }
-
- else /* test if item is not on this page */ {
-
- q = (r < 0 ) ? a->P0 : a->E[r].P;
-
- if (!search(x,q,&u) ) return(NO);
- }
- }
-
- /* ---- insert an item */
-
- if (a->M < NN){
- /* page not full yet, add to it. 'Bump' items from r+1 to
- M-1 */
- movdnb((char *)&a->E[r+2], (char *)&a->E[r+1],
- sizitem*((a->M++)-r-1) );
- moveb((char *)&a->E[r+1], (char *)&u, sizitem);
-
- return(NO);
- }
- else {
- /* page full, split it and push center item upward in tree */
- if ((b = alloc(sizpage)) == NULL )
- fprintf(STDERR,"\nOut of memory");
-
- #ifdef DIAGNOSE
- fprintf(STDERR,"\n\n ****** new node at %x",b);
- #endif
-
- if (r <= NM1 ) /* put new item in old page */ {
-
- if (r == NM1 ) moveb((char *)v, (char *)&u, sizitem );
- else {
- /* 'bump' down items from r+2 to N-1 */
-
- moveb((char *)v, (char *)&a->E[NM1], sizitem);
- movdnb((char *)&a->E[r+2], (char *)&a->E[r+1],
- sizitem*(NM2-r));
- moveb((char *)&a->E[r+1], (char *)&u,sizitem);
- }
- moveb((char *)&b->E[0], (char *)&a->E[N], sizitem*N);
- }
- else {
- /* move upper N items and new item to new page */
-
- moveb((char *)v, (char *)&a->E[N], sizitem ) ;
-
- if ((r = r - N) > 0 )
- moveb((char *)&b->E[0],
- (char *)&a->E[NP1], sizitem*r);
-
- moveb((char *)&b->E[r], (char *)&u, sizitem);
-
- if ((i = NM1-r) > 0 )
- moveb((char *)&b->E[r+1],
- (char *)&a->E[NP1+r], sizitem * i);
- }
-
- a->M = b->M = N ;
- b->P0 = v->P;
- v->P = b ;
- }
-
- return (YES);
- }
- trim(strg)
- char *strg;
- {
-
- int l;
-
- l=strlen(strg);
- while (strg[l] <= ' ' ){
- if (l <= 0 ) break;
- strg[l--]='\0';
- }
- return(l);
- }
- moveb(a,b,c)
- char *a,*b;
- int c;
- {
-
- int i;
- #ifdef FAST /* use potentially faster move routine */
- move(a,b,c);
- return;
- #else
- /* for (i=0 ; i < c ; ++i ) a[i]=b[i] ; */
- while (c-- > 0)
- *a++= *b++;
- #endif
- }
- movdnb(a,b,c)
- char *a,*b;
- int c;
- {
-
- int i;
- #ifdef FAST /* use potentially faster move routine */
- movdn(a,b,c);
- return;
- #else
- /* for (; --c >= 0 ; c ) a[c]=b[c] ; */
- for (a+= c, b+= c; c-- > 0; )
- *--a= *--b;
- #endif
- }
-
- printtree(p,l)
- PAGE *p;
- int l;
- {
-
- int i,j;
- char *t;
-
- if (maxcount <= 0) return;
-
- if (p != NULL ){
-
-
- printtree(p->P0, l+1 );
-
- for (i=0; i <= (j=p->M-1) ; ++i ){
- --maxcount;
- fprintf(STDERR,"\n");
- #ifdef SPRINT
- fprintf(STDERR," %d %d ",l,p->E[i].COUNT );
- #endif
- prints(p->E[i].KEY);
-
- printtree(p->E[i].P,l+1);
- }
- }
- }
- defkey(a,b)
- char **a,*b;
- {
-
- #ifdef KEYPTR
- if ((*a=alloc(strlen(b)+1)) == NULL ){
- fprintf(STDERR,"\ninsufficient string storage in defkey\n");
- exit(3);
- }
- strcpy(*a,b);
- #endif
- #ifndef KEYPTR
- strcpy(a,b);
- #endif
- }
- prints(str)
- char *str;
- {
-
- if (o_flg)
- fputs(str,outfile);
- else
- fputs(str,STDOUT); /* and print out the line */
- }