home *** CD-ROM | disk | FTP | other *** search
- /*
- * usage - a_occur2 [ min_frequency [ filename_under_min ] ]
- * < merged a_occur files > combined
- *
- * A_OCCUR2 A utility to calculate cumulative frequency of
- * merged A_OCCUR outputs. If a minimum frequency is
- * specified, then all lower frequency items are either
- * suppressed or sent to a file named in the next argument.
- *
- * Input: ASCII text, in which each line starts with a number
- * (a frequency count) followed by blanks, then sorted text
- * starting in the seventh column.
- *
- * Output: A copy of the same file in which multiple identical lines
- * are shown only once, preceded by the combined frequency
- * count.
- *
- * Writeup: MIR TUTORIAL ONE, topic five.
- *
- * Written: Douglas Lowry Oct 28 87
- * Modified: Douglas Lowry Apr 30 92 Reworked entirely
- * Copyright (C) 1992 Innotech Inc.
- *
- * The MIR (Mass Indexing and Retrieval) Tutorials explain detailed
- * usage and co-ordination of the MIR family of programs to analyze,
- * prepare and index databases (small through gigabyte size), and
- * how to build integrated retrieval software around the MIR search
- * engine. The fifth of the five MIR tutorial series explains how
- * to extend indexing capability into leading edge search-related
- * technologies. For more information, GO IBMPRO on CompuServe;
- * MIR files are in the DBMS library. The same files are on the
- * Canada Remote Systems BBS. A diskette copy of the Introduction
- * is available by mail ($10 US... check, Visa or Mastercard);
- * diskettes with Introduction, Tutorial ONE software and the
- * shareware Tutorial ONE text cost $29. Shareware registration
- * for a tutorial is also $29.
- *
- * E-mail...
- * Compuserve 71431,1337
- * Internet doug.lowry%canrem.com
- * UUCP canrem!doug.lowry
- * Others: doug.lowry@canrem.uucp
- *
- * FAX... 416 963-5677
- *
- * "Snail mail"... Douglas Lowry, Ph.D.
- * Marpex Inc.
- * 5334 Yonge Street, #1102
- * North York, Ontario
- * Canada M2N 6M2
- *
- * Related database consultation and preparation services are
- * available through:
- * Innotech Inc., 2001 Sheppard Avenue E., Suite #118,
- * North York, Ontario Canada M2J 4Z7
- * Tel. 416 492-3838 FAX 416 492-3843
- *
- * This program is free software; you may redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * (file 05LICENS) along with this program; if not, write to the
- * Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
- * USA.
- */
-
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-
/* Size of each line buffer handed to fgets(). */
#define     MAX_BYTES       512

/* Spelled-out endless loop: `repeat { ... }` runs until a break. */
#define     repeat          for(;;)
-
- /*
- * declarations
- */
-
/* Two-valued truth type; FALSE is 0 and TRUE is 1 by enumerator order. */
typedef enum _bool
{
    FALSE,
    TRUE
} Bool;
-
/*
 * Full prototypes (rather than empty parameter lists) so the compiler
 * checks the arguments at every call site.
 */
void    Usage_( void );
void    process( int min_occur, FILE *fp_out );

/* Returns the program name shown in the usage message. */
char *
Cmdname_( void )
{
    return( "a_occur2" );
}
-
/*
 *  MAIN
 *
 *  Parses the optional command line arguments and then lets process()
 *  combine the lines arriving on standard input.
 *
 *      argv[1]   minimum frequency; must be a whole number > 0
 *      argv[2]   name of the file receiving lines below that minimum
 *
 *  Returns 0 on success, 1 if an output stream could not be written
 *  completely.  Argument errors end the program through Usage_().
 */

int
main( int argc, char **argv )
{
    FILE    *fp_out = NULL ;    /* for under minimum listing    */
    char    *end ;              /* first character strtol did not use */
    char    c10 ;
    long    wanted ;
    int     min_occur = 0 ;
    int     status = 0 ;

    if( argc > 1 )
    {
        c10 = argv[1][0] ;
        if( argc > 3 || c10 == '-' || c10 == '/' || c10 == '?' )
            Usage_();

        /*
         * strtol rather than atoi: trailing junk ("12abc") is rejected
         * and an out of range value cannot cause undefined behaviour.
         */
        wanted = strtol( argv[1], &end, 10 ) ;
        if( end == argv[1] || *end != '\0' || wanted < 1
                || wanted > INT_MAX )
        {
            fprintf( stderr,"First argument expects a number > 0.\n\n" );
            Usage_();
        }
        min_occur = (int) wanted ;
    }

    if( argc > 2 )
    {
        fp_out = fopen( argv[2], "w" ) ;
        if( fp_out == NULL )
        {
            fprintf( stderr, "\nUnable to open file %s.\n",
                argv[2] ) ;
            Usage_() ;
        }
    }

    process( min_occur, fp_out ) ;

    /* A failed close means buffered data was lost: report it in the status. */
    if( fp_out != NULL && fclose( fp_out ) != 0 )
    {
        fprintf( stderr, "Trouble closing file %s\n", argv[2] );
        status = 1 ;
    }
    if( fflush( stdout ) != 0 )
    {
        fprintf( stderr, "Trouble writing standard output\n" );
        status = 1 ;
    }
    return( status ) ;
}
/*
 *  Usage_
 *
 *  Prints the usage summary on stderr and ends the program with
 *  exit status 1; it never returns to the caller.
 *
 *  The text is assembled from adjacent string literals.  Splicing a
 *  literal across source lines with a trailing backslash would make the
 *  indentation of the source file part of the message.
 */
void
Usage_( void )
{
    fprintf( stderr,
        "\nusage: %s [ min_frequency [ filename_under_min ] ]\n"
        "       < merged a_occur files > combined\n\n"
        "       A utility to calculate cumulative frequency of\n"
        "       merged A_OCCUR outputs. If a minimum frequency is\n",
        Cmdname_() ) ;
    fputs(
        "       specified, then all lower frequency items are either\n"
        "       suppressed or sent to a file named in the next argument.\n\n"
        "Input: ASCII text, in which each line starts with a number\n"
        "       (a frequency count) followed by blanks, then sorted text\n"
        "       starting in the seventh column.\n\n",
        stderr ) ;
    fputs(
        "Output: A copy of the same file in which multiple identical lines\n"
        "        are shown only once, preceded by the combined frequency\n"
        "        count.\n\n"
        "Writeup: MIR TUTORIAL ONE, topic five.\n\n",
        stderr ) ;
    exit( 1 ) ;
}
- /*
- * PROCESS
- */
- void
- process( min_occur, fp_out )
- FILE *fp_out ; /* for under minimum listing */
- int min_occur ;
- {
- char buf[2][MAX_BYTES]; /* alternating line inputs */
- Bool done, /* last line has been read */
- same; /* 2 successive lines identical */
- long int
- freq[2], /* count of occurences of line */
- sizer ;
- int this, /* current buffer is 0 or 1 */
- that, /* other buffer is 1 or 0 */
- lines_in, /* count */
- len[2], /* line length of each buffer */
- i;
-
- len[0] = len[1] = freq[0] = freq[1] = lines_in = 0;
- done = FALSE;
- this = 0;
- that = 1;
-
- while( !done )
- {
- if( fgets( buf[this], MAX_BYTES, stdin ) == NULL )
- done = TRUE;
- freq[this] = atol( buf[this] ) ;
- if( !freq[ this ] )
- {
- fprintf( stderr, "No frequency beginning line %d...\n%s\n",
- lines_in, buf[ this ] );
- exit( 1 ) ;
- }
- lines_in++ ;
- len[this] = strlen( buf[this] ) - 1 ;
- while( isspace( buf[this][len[this]-1] ))
- len[this] -= 1 ;
- buf[this][len[this]] = '\0'; /* replace linefeed */
- if( done || len[this] < 0 )
- len[this] = 0;
-
- same = FALSE; /* compare 2 consecutive lines */
- if( len[this] == len[that] )
- {
- same = TRUE;
- for( i = 6; i < len[0]; i++ )
- {
- if( buf[0][i] != buf[1][i] )
- {
- same = FALSE;
- if( buf[this][i] < buf[that][i] )
- {
- fprintf( stderr,
- "Not sorted... lines %d and %d\n%s\n%s\n",
- lines_in - 1, lines_in, buf[this], buf[that] );
- Usage_() ;
- }
- break;
- }
- }
- }
-
- if( same )
- freq[ that ] += freq[ this ];
- else /* if not same, print */
- {
- if( freq[ that ] )
- {
- if( freq [ that ] >= min_occur )
- {
- printf( "%ld", freq[ that ] ) ;
- sizer = freq[ that ] ;
- while( sizer < 100000 )
- {
- putchar( ' ' ) ;
- sizer *= 10 ;
- }
- if( !printf( "%s\n", &buf[that][6] ))
- {
- fprintf( stderr, "Writing failure after %d lines\n",
- lines_in );
- exit( 1 ) ;
- }
- }
- else if( min_occur && fp_out != NULL )
- {
- fprintf( fp_out, "%ld", freq[ that ] ) ;
- sizer = freq[ that ] ;
- while( sizer < 100000 )
- {
- fputc( ' ', fp_out ) ;
- sizer *= 10 ;
- }
- if( !fprintf( fp_out, "%s\n", &buf[that][6] ))
- {
- fprintf( stderr, "Writing failure after %d lines\n",
- lines_in );
- exit( 1 ) ;
- }
- }
- }
- this = that;
- if( this )
- that = 0;
- else
- that = 1;
- }
- }
-
- return ;
- }