home *** CD-ROM | disk | FTP | other *** search
- /*
- * usage - a_occur [ min_freq ] [ /n ] < ascii_text > report
- * /n = non-sequenced data is okay
- *
- * A_OCCUR Count the frequency of occurrence of identical lines
- * If a minimum frequency is specified, lines occurring
- * fewer times are dropped entirely from the result.
- *
- * Input: ASCII text, which must be in sorted order UNLESS the
- * flag "/n" is included.
- *
- * Output: A reduced copy of the file with each line shown only
- * once. Each line begins with a frequency count, padded
- * out to six characters with blanks.
- *
- * Writeup: MIR TUTORIAL ONE, topic five.
- * See also the related programs A_OCCUR2 and A_OCCUR3.
- *
- * Written: Douglas Lowry Mar 04 87
- * Modified: Douglas Lowry Apr 30 92 Reworked entirely
- * Copyright (C) 1992 Innotech Inc.
- *
- * The MIR (Mass Indexing and Retrieval) Tutorials explain detailed
- * usage and co-ordination of the MIR family of programs to analyze,
- * prepare and index databases (small through gigabyte size), and
- * how to build integrated retrieval software around the MIR search
- * engine. The fifth of the five MIR tutorial series explains how
- * to extend indexing capability into leading edge search-related
- * technologies. For more information, GO IBMPRO on CompuServe;
- * MIR files are in the DBMS library. The same files are on the
- * Canada Remote Systems BBS. A diskette copy of the Introduction
- * is available by mail ($10 US... check, Visa or Mastercard);
- * diskettes with Introduction, Tutorial ONE software and the
- * shareware Tutorial ONE text cost $29. Shareware registration
- * for a tutorial is also $29.
- *
- * E-mail...
- * Compuserve 71431,1337
- * Internet doug.lowry%canrem.com
- * UUCP canrem!doug.lowry
- * Others: doug.lowry@canrem.uucp
- *
- * FAX... 416 963-5677
- *
- * "Snail mail"... Douglas Lowry, Ph.D.
- * Marpex Inc.
- * 5334 Yonge Street, #1102
- * North York, Ontario
- * Canada M2N 6M2
- *
- * Related database consultation and preparation services are
- * available through:
- * Innotech Inc., 2001 Sheppard Avenue E., Suite #118,
- * North York, Ontario Canada M2J 4Z7
- * Tel. 416 492-3838 FAX 416 492-3843
- *
- * This program is free software; you may redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * (file 05LICENS) along with this program; if not, write to the
- * Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
- * USA.
- */
-
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-
#define MAX_BYTES   512         /* size of each line buffer, terminator included */
#define repeat      for(;;)

/*
 * declarations
 */

typedef enum _bool
    { FALSE = 0, TRUE = 1 } Bool;

/*
 * Full prototypes, so that every call is checked against the real
 * parameter lists instead of relying on default argument promotions.
 */
void    Usage_( void );
void    process( int min_freq, Bool must_be_seq );

/* Name under which the program identifies itself in the usage text. */
char *Cmdname_( void ) { return( "a_occur" ); }
-
- /*
- * MAIN
- */
-
- main( argc, argv )
- int argc;
- char **argv;
- {
- Bool must_be_seq ; /* Must be sequential ASCII order */
- int min_freq, /* threshold frequency to show a line */
- i, val;
-
- min_freq = 1;
- must_be_seq = TRUE;
-
- if( argc > 3 )
- Usage_();
-
- for( i = 1 ; i < argc ; i++ )
- {
- if(( val = atoi( argv[i] )))
- min_freq = val;
- else if(( argv[i][0] == '-' || argv[i][0] == '/' ) &&
- ( argv[i][1] == 'n' || argv[i][1] == 'N' ))
- must_be_seq = FALSE;
- else
- Usage_();
- }
-
- process( min_freq, must_be_seq ) ;
-
- exit( 0 ) ;
- }
/*
 * Usage_
 *
 * Write the command summary to stderr and end the program with exit
 * status 1.  This function does not return.
 *
 * The text is assembled from adjacent string literals, one per
 * output line.  A backslash-newline inside a literal would splice the
 * next source line's leading white space into the message, making
 * the printed layout depend on how this file is indented.
 */
void
Usage_( void )
{
    fprintf( stderr,
        "\nusage: %s [ min_freq ] [ /n ] < ascii_text > report\n"
        "         /n = non-sequenced data is okay\n\n",
        Cmdname_() );
    fputs(
        "Count the frequency of occurrence of identical lines\n"
        "If a minimum frequency is specified, lines occurring\n"
        "fewer times are dropped entirely from the result.\n\n"
        "Input:   ASCII text, which must be in sorted order UNLESS the\n"
        "         flag \"/n\" is included.\n\n"
        "Output:  A reduced copy of the file with each line shown only\n"
        "         once. Each line begins with a frequency count, padded\n"
        "         out to six characters with blanks.\n\n"
        "Writeup: MIR TUTORIAL ONE, topic five.\n"
        "         See also the related programs A_OCCUR2 and A_OCCUR3.\n\n",
        stderr );
    exit( 1 ) ;
}
- /*
- * PROCESS
- */
- void
- process( min_freq, must_be_seq )
- int min_freq ;
- Bool must_be_seq ; /* must be sequential order (default) */
- {
- char buf[2][MAX_BYTES]; /* alternating line inputs */
- Bool done, /* last line has been read */
- same; /* 2 successive lines identical */
- long int
- freq, /* count of occurrences of line */
- sizer ;
- int this, /* current buffer is 0 or 1 */
- that, /* other buffer is 1 or 0 */
- lines_in, /* count */
- len[2], /* line length of each buffer */
- i ;
-
- len[0] = len[1] = freq = lines_in = 0;
- done = FALSE;
- this = 0;
- that = 1;
-
- while( !done )
- {
- if( fgets( buf[this], MAX_BYTES, stdin ) == NULL )
- done = TRUE;
- lines_in++ ;
- len[this] = strlen( buf[this] ) - 1 ;
- while( isspace( buf[this][len[this]-1] ))
- len[this] -= 1 ;
- if( len[this] > MAX_BYTES - 3 )
- {
- fprintf( stderr, "FATAL... Line length exceeds %d bytes.\n\n",
- MAX_BYTES ) ;
- exit( 1 ) ;
- }
- buf[this][len[this]] = '\0' ;
- if( done || len[this] < 0 )
- len[this] = 0;
-
- same = FALSE; /* compare 2 consecutive lines */
- if( len[this] == len[that] )
- {
- same = TRUE;
- for( i = 0; i < len[0]; i++ )
- {
- if( buf[0][i] != buf[1][i] )
- {
- same = FALSE;
- if( must_be_seq && buf[this][i] < buf[that][i] )
- {
- fprintf( stderr,
- "Not sorted... lines %d and %d\n%s\n%s\n", lines_in - 1, lines_in,
- buf[this], buf[that] );
- Usage_();
- }
- break;
- }
- }
- }
-
- if( same )
- freq++;
- else /* if not same, print */
- {
- if( freq >= min_freq )
- {
- printf( "%d", freq ) ;
- sizer = freq ;
- while( sizer < 100000 )
- {
- putchar( ' ' ) ;
- sizer *= 10 ;
- }
- if( !printf( "%s\n", buf[that] ))
- {
- fprintf( stderr, "FATAL... unable to write.\n\n" ) ;
- exit( 1 ) ;
- }
- }
- freq = 1;
- this = that;
- if( this )
- that = 0;
- else
- that = 1;
- }
- }
-
- return;
- }