home *** CD-ROM | disk | FTP | other *** search
- /*
- * usage: a_len [interval] file_name[s]
- *
- * A_LEN Analyze the distribution of line lengths up to 1024 bytes
- * within any file. The reporting interval (an integer from
- * 1 to 100) is a count of the lengths that will be grouped
- * together. For example, an interval of 10 means that
- * frequencies of length 0, length 1-10, length 11-20, etc.
- * are shown in the report. The default interval is 10. If
- * the first file name starts with numeric digits, show the
- * interval first!
- *
- * input: Any ASCII file[s].
- *
- * output: file_name.len which reports the frequency of line lengths
- * occurring in the file. Lengths exclude carriage returns and
- * line feeds.
- *
- * writeup: MIR TUTORIAL ONE, topic 6
- *
- * written: Douglas Lowry Feb 13 92
- * modified: Douglas Lowry Feb 14 92
- * Douglas Lowry Mar 20 92 Cycle thru report names
- * Copyright (C) 1992 Marpex Inc.
- * Compiled with STACK = 8000
- * NOTE THE PROGRAM COMPILES MUCH SMALLER (10,000 INSTEAD OF 19,200 BYTES)
- * IF THE USE OF DOUBLE TYPE IS OMITTED IN THE "REPORT" FUNCTION.
- *
- * The MIR (Mass Indexing and Retrieval) Tutorials explain detailed
- * usage and co-ordination of the MIR family of programs to analyze,
- * prepare and index databases (small through gigabyte size), and
- * how to build integrated retrieval software around the MIR search
- * engine. The fifth of the five MIR tutorial series explains how
- * to extend indexing capability into leading edge search-related
- * technologies. For more information, GO IBMPRO on CompuServe;
- * MIR files are in the DBMS library. The same files are on the
- * Canada Remote Systems BBS. A diskette copy of the Introduction
- * is available by mail ($10 US... check, Visa or Mastercard);
- * diskettes with Introduction, Tutorial ONE software and the
- * shareware Tutorial ONE text cost $29. Shareware registration
- * for a tutorial is also $29.
- *
- * E-mail...
- * Compuserve 71431,1337
- * Internet doug.lowry%canrem.com
- * UUCP canrem!doug.lowry
- * Others: doug.lowry@canrem.uucp
- *
- * FAX... 416 963-5677
- *
- * "Snail mail"... Douglas Lowry, Ph.D.
- * Marpex Inc.
- * 5334 Yonge Street, #1102
- * North York, Ontario
- * Canada M2N 6M2
- *
- * Related database consultation and preparation services are
- * available through:
- * Innotech Inc., 2001 Sheppard Avenue E., Suite #118,
- * North York, Ontario Canada M2J 4Z7
- * Tel. 416 492-3838 FAX 416 492-3843
- *
- * This program is free software; you may redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * (file 05LICENS) along with this program; if not, write to the
- * Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
- * USA.
- */
-
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
-
#define repeat  for(;;)         /* endless loop; leave it with break/return */

/*
 * Two-state flag type.
 * NOTE(review): the enum tag "bool" collides with the macro defined by
 * <stdbool.h> (and with the C23 keyword), so that header cannot be
 * included in this file while the tag keeps this name.
 */
typedef enum bool
    { FALSE = 0, TRUE = 1 }  Bool ;

#define MAX_BYTES   2048        /* # of bytes in input buffer           */

/*
 * Full prototypes, so that every call is checked against the parameter
 * list.  The array sizes only document what the callers pass; the
 * parameters are ordinary pointers.
 */
void    process( FILE *fp, long int cum[ 1026 ] ) ;
void    Usage_( void ) ;
void    report( FILE *fp_out, long int cum[ 1026 ], int interval,
                char fname[20] ) ;
void    non_exist( char fname[] ) ;

/*
 * CMDNAME_ - Name of this program as it is shown in the usage message.
 *            The returned string is a literal and must not be modified.
 */
char *
Cmdname_( void )
{
    return( "a_len" ) ;
}
-
/*
 * REPORT_NAME - Derive the report file name from an input file name:
 *               the extension of the base name (if there is one) is
 *               replaced by ".len", otherwise ".len" is appended.
 *
 *  dst, size   destination buffer and its size in bytes
 *  src         input file name, possibly with a directory part
 *
 *  Returns 1 on success, 0 if the result does not fit into dst.
 */
static int
report_name( char *dst, size_t size, const char *src )
{
    const char  *base, *dot, *p ;
    size_t      stem ;          /* bytes of src kept in front of ".len" */

    /*  Only a dot after the last path separator can start an extension,
     *  so names such as "..\dir\file" or "./file.txt" keep their
     *  directory part intact.
     */
    base = src ;
    for( p = src ; *p != '\0' ; p++ )
    {
        if( *p == '/' || *p == '\\' || *p == ':' )
            base = p + 1 ;
    }

    dot = strrchr( base, '.' ) ;
    if( dot == base )           /* leading dot is part of the name itself */
        dot = NULL ;
    stem = ( dot != NULL ) ? ( size_t )( dot - src ) : strlen( src ) ;

    if( stem + sizeof( ".len" ) > size )
        return( 0 ) ;
    memcpy( dst, src, stem ) ;
    memcpy( dst + stem, ".len", sizeof( ".len" )) ;     /* copies the NUL too */
    return( 1 ) ;
}

/*
 * MAIN - usage: a_len [interval] file_name[s]
 *
 *  For every file named on the command line, tally the line lengths
 *  (process), pick an unused report name (report_name, non_exist) and
 *  write the report (report).  An optional first argument from 1 to
 *  100 sets the reporting interval; the default is 10.
 *
 *  Exit status is 0 on success and 1 after a usage error, a file that
 *  cannot be opened, or a report name that would be too long.
 */
int
main( int argc, char **argv )
{
    FILE        *fp, *fp_out ;
    char        fname[ FILENAME_MAX ] ; /* name of the report file      */
    char        c10 ;                   /* first byte of argument 1     */
    int         interval,               /* argument 1 - 100             */
                from,                   /* argument # of first file name*/
                file ;
    long int    i ;
    long int    cum[ 1026 ] ;           /* accumulator for each length  */
                        /* Those over 1024 are counted in [1025]        */

    /*  argv[1] may only be looked at once we know that it is there.
     *  Usage_() does not return.
     */
    if( argc < 2 )
        Usage_() ;
    c10 = argv[1][0] ;
    /*  NOTE(review): '/' is treated as a switch character, so a first
     *  argument that is an absolute path asks for the usage text.
     */
    if( c10 == '-' || c10 == '/' || c10 == '?' )
        Usage_() ;

    /*  A first argument that starts with digits is taken as the interval;
     *  strtol (unlike atoi) is well defined for out-of-range input.
     */
    interval = 10 ;
    from = 1 ;
    i = strtol( argv[1], NULL, 10 ) ;
    if( i > 100 )
        Usage_() ;
    if( i > 0 )
    {
        interval = ( int ) i ;
        from = 2 ;
    }
    if( from >= argc )                  /* an interval, but no file names */
        Usage_() ;

    for( file = from ; file < argc ; file++ )
    {
        /* Read access is all that is needed, so that write-protected
         * files can be analyzed as well.
         */
        if(( fp = fopen( argv[ file ], "rb" )) == NULL )
        {
            fprintf( stderr, "Can't open file %s\n", argv[ file ] ) ;
            exit( 1 ) ;
        }
        process( fp, cum ) ;
        if( fclose( fp ))
            fprintf( stderr, "Problem closing %s\n", argv[ file ] ) ;

        if( !report_name( fname, sizeof( fname ), argv[ file ] ))
        {
            fprintf( stderr, "File name %s is too long\n", argv[ file ] ) ;
            exit( 1 ) ;
        }

        /* Cycle through alternative names rather than overwrite an
         * existing report.
         */
        non_exist( fname ) ;
        if(( fp_out = fopen( fname, "w" )) == NULL )
        {
            fprintf( stderr, "Can't open file %s\n", fname ) ;
            exit( 1 ) ;
        }

        report( fp_out, cum, interval, fname ) ;

        if( fclose( fp_out ))
            fprintf( stderr, "Problem closing %s\n", fname ) ;
    }

    return( 0 ) ;
}
/*
 * USAGE_ - Write the usage summary to stderr and end the program with
 *          exit status 1.  This function does not return.
 *
 *  The text is built from adjacent string literals, one per output
 *  line, so the layout of the message does not depend on how the
 *  source happens to be indented.  It is written in three pieces to
 *  keep each literal short.
 */
void
Usage_( void )
{
    fprintf( stderr, "\nusage: %s [interval] file_name[s]\n\n",
        Cmdname_() ) ;
    fputs(
        "        Analyze the distribution of line lengths up to 1024 bytes\n"
        "        within any file.  The reporting interval (an integer from\n"
        "        1 to 100) is a count of the lengths that will be grouped\n"
        "        together.  For example, an interval of 10 means that\n"
        "        frequencies of length 0, length 1-10, length 11-20, etc.\n"
        "        are shown in the report.  The default interval is 10.  If\n"
        "        the first file name starts with numeric digits, show the\n"
        "        interval first!\n\n",
        stderr ) ;
    fputs(
        "input: Any ASCII file[s].\n\n"
        "output: file_name.len which reports the frequency of line lengths\n"
        "        occurring in the file.  Lengths exclude carriage returns and\n"
        "        line feeds.\n\n"
        "writeup: MIR TUTORIAL ONE, topic 6\n\n",
        stderr ) ;
    exit( 1 ) ;
}
/*
 * PROCESS - Tally the length of every line of an open file
 *
 *  fp      stream to read; it is read until fgets reports end of file
 *          or an error
 *  cum     receives the counts: cum[n] is the number of lines of
 *          exactly n bytes (n = 0 - 1024), cum[1025] the number of
 *          longer lines.  All 1026 entries are reset to zero first.
 *
 *  Trailing line feeds and carriage returns are not part of the length.
 *  A line of more than MAX_BYTES - 4 bytes is fatal: a message goes to
 *  stderr and the program ends with exit status 1.
 */
void
process( FILE *fp, long int cum[ 1026 ] )
{
    char    line_in[ MAX_BYTES ] ;
    int     len, i ;

    for( i = 0 ; i < 1026 ; i++ )
        cum[ i ] = 0 ;

    while( fgets( line_in, MAX_BYTES, fp ) != NULL )
    {
        len = ( int ) strlen( line_in ) ;

        /*  Strip the line terminator.  The len > 0 test matters: for an
         *  empty line everything is stripped, and without the test the
         *  byte in front of the buffer would be examined next.
         */
        while( len > 0 &&
               ( line_in[ len-1 ] == '\n' || line_in[ len-1 ] == '\015' ))
            len-- ;

        if( len > MAX_BYTES - 4 )
        {
            fprintf( stderr, "FATAL... Line exceeds %d bytes.\n\n",
                MAX_BYTES - 4 ) ;
            exit( 1 ) ;
        }

        if( len > 1024 )
            cum[ 1025 ]++ ;             /* overflow bucket              */
        else
            cum[ len ]++ ;
    }

    return ;
}
/*
 * REPORT_LINE - Write one line of the report: the label, the count and
 *               the count as a percentage of all lines.
 *
 *  Must only be called with count > 0 (and so lines > 0).
 *  Returns 1 if the line was written, 0 if fprintf reported an error.
 */
static int
report_line( FILE *fp_out, const char *label, long int count,
             long int lines )
{
    double  pct ;

    pct = 100.0 * (( double ) count / ( double ) lines ) ;
    /* fprintf signals failure with a negative value, never with zero */
    return( fprintf( fp_out, "%s:%7ld %4.1f%%\n", label, count, pct ) >= 0 ) ;
}

/*
 * REPORT - Write the analysis
 *
 *  fp_out      open stream that receives the report
 *  cum         cum[n] = number of lines of n bytes (n = 0 - 1024);
 *              cum[1025] = number of longer lines
 *  interval    how many consecutive lengths share one report line;
 *              values below 1 are treated as 1
 *  fname       name of the report file, used in the messages only
 *
 *  Length 0 and the "over 1024" bucket always get a line of their own;
 *  lengths 1 - 1024 are grouped by interval, the last group being cut
 *  off at 1024.  Lines and groups with a zero count are left out.
 *  With an interval of 1 the label is the length itself, otherwise it
 *  is the "low - high" range.  A summary (or a failure notice) is
 *  written to stderr.
 */
void
report( FILE *fp_out, long int cum[ 1026 ], int interval, char fname[20] )
{
    char        label[ 32 ] ;   /* holds two formatted ints and " -"    */
    long int    group_cum,      /* frequency across interval            */
                lines ;         /* count of lines                       */
    int         low, high,      /* ends of interval                     */
                single,         /* one length per report line?          */
                ok, i ;

    lines = 0 ;
    for( i = 0 ; i < 1026 ; i++ )
        lines += cum[ i ] ;

    if( interval < 1 )          /* a group must advance by at least one */
        interval = 1 ;
    single = ( interval == 1 ) ;
    ok = 1 ;

    if( cum[0] )
        ok = report_line( fp_out, single ? "    0" : "         0",
                          cum[0], lines ) ;

    for( low = 1 ; ok && low <= 1024 ; low = high + 1 )
    {
        /* written this way round so that a huge interval cannot overflow */
        high = ( interval > 1024 - low ) ? 1024 : low + interval - 1 ;

        group_cum = 0 ;
        for( i = low ; i <= high ; i++ )
            group_cum += cum[ i ] ;
        if( !group_cum )
            continue ;

        if( single )
            sprintf( label, "%5d", low ) ;
        else
            sprintf( label, "%4d -%4d", low, high ) ;
        ok = report_line( fp_out, label, group_cum, lines ) ;
    }

    if( ok && cum[1025] )
        ok = report_line( fp_out, single ? "1025+" : " Over 1024",
                          cum[1025], lines ) ;

    if( !ok )
        fprintf( stderr, "Unable to write report in file %s\n", fname ) ;
    else
        fprintf( stderr, "\n%ld lines reported in file %s\n\n", lines,
            fname ) ;

    return ;
}
/*
 * NON_EXIST    Test the existence of a file; if it exists,
 *              substitute digits successively for the last
 *              byte in the name until a non-existent file
 *              is named, or until the last digit is '9'; do
 *              the same with the second last byte, for 100
 *              possible combinations
 *
 *  fname   name to test; it is changed in place, so it must be
 *          writable.  Its length never changes.
 *
 *  The first ten candidates keep the second last byte of the name and
 *  end in '0' - '9'; the following ones end in "10" - "99".  The
 *  second last byte is left alone if it is the '.', or if the name is
 *  only one byte long.  A name for which stat() fails counts as free.
 *  If every candidate exists, fname is left at the last one tried.
 */
void
non_exist( char fname[] )
{
    struct stat buf ;
    size_t      len ;
    int         decade,         /* batch of 10 names                    */
                unit ;          /* last digit within the batch          */

    if( stat( fname, &buf ) != 0 )
        return ;                /* no data obtained: name is free as is */

    len = strlen( fname ) ;
    if( len == 0 )
        return ;                /* no last byte to substitute           */

    for( decade = 0 ; decade < 10 ; decade++ )
    {
        if( decade > 0 )
        {
            /*  Don't mess with a one digit name extension, and don't
             *  reach in front of a one byte name.
             */
            if( len < 2 || fname[ len - 2 ] == '.' )
                return ;
            fname[ len - 2 ] = ( char )( '0' + decade ) ;
        }

        /*  Try names ending in 0 through 9  */

        for( unit = 0 ; unit < 10 ; unit++ )
        {
            fname[ len - 1 ] = ( char )( '0' + unit ) ;
            if( stat( fname, &buf ) != 0 )
                return ;        /* found a name that is not in use      */
        }
    }

    return ;
}