home *** CD-ROM | disk | FTP | other *** search
- /*
- * usage: a_bytes [ /L ] file_name[s]
- *
- * A_BYTES Analyze the bytes (characters) used within any file, report
- * the frequency of each byte present. If the location flag /L
- * is set, include offsets of the first 8 occurrences of each
- * byte pattern present.
- *
- * input: Any file[s] whatsoever.
- *
- * output: file_name.BYT which contains up to 256 lines, one line for
- * each different byte present. The byte is shown first in
- * printable OR octal form, then the hexadecimal equivalent.
- * The third column is frequency. The fourth column shows
- * percentage of total occurrences within the file.
- *
- * If the /L locations option is selected, the output file is
- * named file_name.LOC and the offsets of the first up to 8
- * occurrences follow at the end of each line.
- *
- * writeup: MIR TUTORIAL ONE, topic 5
- * Compiled with STACK = 16000
- *
- * written: Douglas Lowry Jan 04 92
- * modified: Douglas Lowry Feb 15 92
- * Mar 20 92 Ten alternative report names
- * Copyright (C) 1992 Marpex Inc.
- *
- * The MIR (Mass Indexing and Retrieval) Tutorials explain detailed
- * usage and co-ordination of the MIR family of programs to analyze,
- * prepare and index databases (small through gigabyte size), and
- * how to build integrated retrieval software around the MIR search
- * engine. The fifth of the five MIR tutorial series explains how
- * to extend indexing capability into leading edge search-related
- * technologies. For more information, GO IBMPRO on CompuServe;
- * MIR files are in the DBMS library. The same files are on the
- * Canada Remote Systems BBS. A diskette copy of the Introduction
- * is available by mail ($10 US... check, Visa or Mastercard);
- * diskettes with Introduction, Tutorial ONE software and the
- * shareware Tutorial ONE text cost $29. Shareware registration
- * for a tutorial is also $29.
- *
- * E-mail...
- * Compuserve 71431,1337
- * Internet doug.lowry%canrem.com
- * UUCP canrem!doug.lowry
- * Others: doug.lowry@canrem.uucp
- *
- * FAX... 416 963-5677
- *
- * "Snail mail"... Douglas Lowry, Ph.D.
- * Marpex Inc.
- * 5334 Yonge Street, #1102
- * North York, Ontario
- * Canada M2N 6M2
- *
- * Related database consultation and preparation services are
- * available through:
- * Innotech Inc., 2001 Sheppard Avenue E., Suite #118,
- * North York, Ontario Canada M2J 4Z7
- * Tel. 416 492-3838 FAX 416 492-3843
- *
- * This program is free software; you may redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * (file 05LICENS) along with this program; if not, write to the
- * Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
- * USA.
- */
-
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <sys\types.h>
#include <sys\stat.h>
#include <io.h>
-
/* Spells an endless loop as a keyword; the loop body has to break out. */
#define repeat for(;;)

/* Two-valued truth type used for the flags in this file. */
typedef enum bool
{ FALSE = 0, TRUE = 1 } Bool ;

#define INTAKE 2048 /* # of bytes in input buffer */

/* Old-style forward declarations (no parameter lists) of the
 * functions defined further down in this file. */
void process(), Usage_(), report(), non_exist() ;
/* Returns the name of this program as a constant string. */
char *
Cmdname_()
{
    char *program_name = "a_bytes";

    return program_name;
}
-
- /*
- * MAIN -
- */
-
- main( argc, argv )
- int argc;
- char **argv;
- {
- Bool foul_up,
- locations ; /* requested by user */
- char c10; /* argv[1][0] */
- int fd, /* file descriptor */
- file, bgn_at ;
- long int cum[ 256 ], /* accumulator for each byte */
- locn[ 256 ][ 8 ]; /* offset of 8 occurrences of each */
-
- locations = FALSE ;
- if( argv[1][1] == 'l' )
- argv[1][1] = 'L' ;
- c10 = argv[1][0] ;
- if(( c10 == '-' || c10 == '/' ) && argv[1][1] == 'L' )
- locations = TRUE ;
- else
- {
- if( argc == 1 || c10 == '-' || c10 == '/' || c10 == '?' )
- Usage_();
- }
-
- bgn_at = 1 ;
- if( locations )
- bgn_at = 2 ;
-
- for( file = bgn_at ; file < argc ; file++ )
- {
- if(( fd = open( argv[ file ], O_RDONLY | O_BINARY )) == -1 )
- {
- fprintf( stderr, "Can't open file %s\n", argv[ file ] );
- exit( 1 );
- }
-
- process( fd, cum, locn, locations );
-
- report( cum, locn, locations, argv[file] ) ;
-
- if( close( fd ))
- fprintf( stderr, "Problem closing %s\n", argv[ file ] );
- }
-
- exit( 0 );
- }
/*
 * USAGE_ - Print the usage summary on stderr, then terminate the
 *          program with exit status 1.
 *
 * The text is assembled from adjacent string literals so that every
 * column of the message is spelled out inside the quotes and does
 * not depend on how the source lines are indented.  It is emitted in
 * several calls to keep each literal short.
 */
void
Usage_()
{
    fprintf( stderr,
        "\nusage:   %s [ /L ] file_name[s]\n\n"
        "         Analyze the bytes (characters) used within any file, report\n"
        "         the frequency of each byte present. If the location flag /L\n"
        "         is set, include offsets of the first 8 occurrences of each\n",
        Cmdname_() );
    fprintf( stderr,
        "         byte pattern present.\n\n"
        "input:   Any file[s] whatsoever.\n\n"
        "output:  file_name.BYT which contains up to 256 lines, one line for\n"
        "         each different byte present. The byte is shown first in\n"
        "         printable OR octal form, then the hexadecimal equivalent.\n" );
    fprintf( stderr,
        "         The third column is frequency. The fourth column shows\n"
        "         percentage of total occurrences within the file.\n\n"
        "         If the /L locations option is selected, the output file is\n"
        "         named file_name.LOC and the offsets of the first up to 8\n"
        "         occurrences follow at the end of each line.\n\n" ) ;
    fprintf( stderr, "writeup: MIR TUTORIAL ONE, topic 5\n\n" );
    exit( 1 ) ;
}
- /*
- * PROCESS
- */
- void
- process( fd, cum, locn, locations )
- int fd; /* file descriptor */
- long int cum[ 256 ], /* accumulator for each byte */
- locn[ 256 ][ 8 ]; /* offset of 8 occurrences of each */
- Bool locations ; /* TRUE if selected by user */
- {
- unsigned char buf_in[ INTAKE ];
- register int buf_len,
- i;
- long int gross_locn, /* count of cumulative intakes */
- fine_locn,
- ct ;
-
- for( i = 0; i < 256 ; i++ )
- cum[ i ] = 0;
- gross_locn = 0 ;
-
- repeat
- {
- fine_locn = 0 ;
- if ( ( buf_len = read( fd, buf_in, INTAKE ) ) == 0 )
- break;
-
- for ( i= 0 ; i < buf_len ; i++ )
- {
- cum[ buf_in[ i ] ]++ ;
- if( locations && cum[ buf_in[ i ] ] < 9 )
- {
- ct = cum[ buf_in[ i ] ] - 1 ;
- locn[ buf_in [ i ] ][ ct ] = gross_locn + fine_locn ;
- }
- fine_locn++ ;
- }
-
- gross_locn += buf_len ;
- }
-
- return;
- }
- /*
- * REPORT - Output the data for analysis of one file
- */
- void
- report( cum, locn, locations, name_in )
- long int cum[ 256 ], /* accumulator for each byte */
- locn[ 256 ][ 8 ]; /* offset of 8 occurrences of each */
- Bool locations ; /* TRUE if selected by user */
- char name_in[] ;
- {
- FILE *fp_out ;
- char fname[20];
- unsigned char c;
- double pct, /* % of occurrences */
- f_grand; /* grand total bytes */
- Bool foul_up ;
- int result,
- len, i, j ;
- long int grand_total,
- limit; /* up to 8 are tracked */
-
- if( locations )
- sprintf( fname, "%s.loc", name_in );
- else
- sprintf( fname, "%s.byt", name_in );
- len = strlen( fname ) ;
- for( i = 0 ; i < len ; i++ )
- {
- if( fname[i] == '.' )
- {
- if( i != len - 3 )
- {
- if( locations )
- strncpy( &fname[i+1], "loc", 3 );
- else
- strncpy( &fname[i+1], "byt", 3 );
- }
- fname[i+4] = '\0' ;
- break ;
- }
- }
-
- non_exist( fname ) ;
-
- if(( fp_out = fopen( fname, "w" )) == NULL )
- {
- fprintf( stderr, "Can't open file %s\n", fname );
- return ;
- }
-
- grand_total = 0;
- foul_up = FALSE ;
- for( i = 0; i < 256 ; i++ )
- grand_total += cum[ i ];
- f_grand = ( double ) grand_total ;
-
- for( i = 0; i < 256 ; i++ )
- {
- if( cum[ i ] )
- {
- c = ( unsigned char ) i;
- pct = 100.0 * ( ( double ) cum[ i ] / f_grand ) ;
-
- /* For Unix version, next line should read
- if( i < 0x21 || i > 0x7e ) */
-
- if( i < 0x21 || i == 0x7f || i == 0xff )
- {
- if( !fprintf( fp_out, "\\%03o [%02X]%7ld %4.1f%%",
- i, i, cum[ i ], pct ) )
- foul_up = TRUE ;
- }
- else
- {
- if( !fprintf( fp_out, "%c [%02X]%7ld %4.1f%%",
- c, i, cum[ i ], pct ))
- foul_up = TRUE ;
- }
- if( locations )
- {
- fputc( ' ', fp_out );
- fputc( ' ', fp_out );
- limit = 8;
- if( cum[i] < 8 )
- limit = cum[i] ;
- for( j = 0 ; j < limit ; j++ )
- fprintf( fp_out, " %ld", locn[ i ][ j ] );
- }
- fputc( '\n', fp_out );
- }
- }
-
- if( foul_up )
- fprintf( stderr, "Unable to write report in file %s\n", fname );
- else
- fprintf( stderr,
- "\n\nInput size = %ld bytes. Results are in file %s\n\n",
- grand_total, fname );
- if( fclose( fp_out ))
- fprintf( stderr, "Problem closing %s\n", fname );
-
- return ;
- }
/*
 * NON_EXIST    Test the existence of a file; if it exists,
 *              substitute digits successively for the last
 *              byte in the name until a non-existent file
 *              is named, or until the last digit is '9'; do
 *              the same with the second last byte, for 100
 *              possible combinations
 *
 * fname is changed in place and keeps its length.  A name that does
 * not exist yet is left untouched.  The second last byte is never
 * replaced when the name is a single character, or when that byte is
 * a '.', '/', '\' or ':', so the search never reaches outside the
 * string and never destroys a one-character extension or a path
 * separator.  If every candidate exists, fname is left holding the
 * last candidate tried.
 */
void
non_exist( fname )
    char fname[] ;
{
    struct stat buf;
    int  len,
         tens,                  /* batch of 10 names */
         units ;
    char before ;               /* byte ahead of the last one */

    if( stat( fname, &buf ) )
        return ;                /* no such file: the name is free */

    len = strlen( fname ) ;
    if( len < 1 )
        return ;

    for( tens = 0 ; tens < 10 ; tens++ )
    {
        if( tens )
        {
            if( len < 2 )
                break ;         /* there is no second last byte */
            before = fname[ len - 2 ] ;
            if( before == '.' || before == '/'
                    || before == '\\' || before == ':' )
                break ;         /* Don't mess with one digit name
                                   extension or a path separator */
            fname[ len - 2 ] = '0' + tens ;
        }

        /* Try names ending in 0 through 9 */

        for( units = 0 ; units < 10 ; units++ )
        {
            fname[ len - 1 ] = '0' + units ;
            if( stat( fname, &buf ) )
                return ;        /* found a name that is not in use */
        }
    }

    return ;
}