Power-Programmierung

home *** CD-ROM | disk | FTP | other *** search

/ Power-Programmierung / CD2.mdf / doc / mir / a_bytes.c < prev next >

Wrap

C/C++ Source or Header | 1992-07-02 | 11.9 KB | 367 lines

/*
 *  usage:  a_bytes [ /L ] file_name[s]
 *
 *  A_BYTES   Analyze the bytes (characters) used within any file, report
 *            the frequency of each byte present.  If the location flag /L
 *            is set, include offsets of the first 8 occurrences of each
 *            byte pattern present.
 *
 *  input:    Any file[s] whatsoever.
 *
 *  output:   file_name.BYT which contains up to 256 lines, one line for
 *            each different byte present.  The byte is shown first in
 *            printable OR octal form, then the hexadecimal equivalent.
 *            The third column is frequency.  The fourth column shows
 *            percentage of total occurrences within the file.
 *
 *            If the /L locations option is selected, the output file is
 *            named file_name.LOC and the offsets of the first up to 8
 *            occurrences follow at the end of each line.
 *
 *  writeup:  MIR TUTORIAL ONE, topic 5
 *            Compiled with STACK = 16000
 *
 *  written:  Douglas Lowry   Jan 04 92
 *  modified: Douglas Lowry   Feb 15 92
 *                            Mar 20 92   Ten alternative report names
 *            Copyright (C) 1992 Marpex Inc.
 *
 *    The MIR (Mass Indexing and Retrieval) Tutorials explain detailed
 *    usage and co-ordination of the MIR family of programs to analyze,
 *    prepare and index databases (small through gigabyte size), and
 *    how to build integrated retrieval software around the MIR search
 *    engine.  The fifth of the five MIR tutorial series explains how
 *    to extend indexing capability into leading edge search-related
 *    technologies.  For more information, GO IBMPRO on CompuServe;
 *    MIR files are in the DBMS library.  The same files are on the
 *    Canada Remote Systems BBS.  A diskette copy of the Introduction
 *    is available by mail ($10 US... check, Visa or Mastercard);
 *    diskettes with Introduction, Tutorial ONE software and the
 *    shareware Tutorial ONE text cost $29.  Shareware registration
 *    for a tutorial is also $29.
 *
 *    E-mail...
 *                Compuserve  71431,1337
 *                Internet    doug.lowry%canrem.com
 *                UUCP        canrem!doug.lowry
 *                Others:     doug.lowry@canrem.uucp
 *
 *    FAX...                  416 963-5677
 *
 *    "Snail mail"...         Douglas Lowry, Ph.D.
 *                            Marpex Inc.
 *                            5334 Yonge Street, #1102
 *                            North York, Ontario
 *                            Canada  M2N 6M2
 *
 *    Related database consultation and preparation services are
 *    available through:
 *              Innotech Inc., 2001 Sheppard Avenue E., Suite #118,
 *              North York, Ontario  Canada   M2J 4Z7
 *              Tel.  416 492-3838   FAX  416 492-3843
 *
 *    This program is free software; you may redistribute it and/or
 *    modify it under the terms of the GNU General Public License as
 *    published by the Free Software Foundation; either version 2 of
 *    the License, or (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    (file 05LICENS) along with this program; if not, write to the
 *    Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
 *    USA.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>

/*
 * Platform selection.  A_BYTES_DOS may also be defined on the compiler
 * command line for a DOS-family compiler that sets none of these macros.
 */
#ifndef A_BYTES_DOS
#if defined(_WIN32) || defined(__MSDOS__) || defined(_MSDOS) \
    || defined(MSDOS) || defined(__DOS__)
#define A_BYTES_DOS 1
#endif
#endif

#ifdef A_BYTES_DOS
#include <io.h>
#else
#include <unistd.h>
#endif

/* Systems without a text/binary distinction do not define O_BINARY. */
#ifndef O_BINARY
#define O_BINARY 0
#endif

#define repeat for(;;)

typedef enum { FALSE = 0, TRUE = 1 } Bool ;

#define INTAKE      2048    /* # of bytes in input buffer              */
#define BYTE_VALUES 256     /* # of distinct values a byte can take    */
#define LOCN_MAX    8       /* # of offsets remembered per byte value  */
#define FNAME_MAX   260     /* room for a report file name incl. path  */

/*
 * Which bytes are reported in octal rather than as the character itself.
 * The DOS rule is the original one; the other rule is the one the author
 * prescribed for a Unix version.
 */
#ifdef A_BYTES_DOS
#define SHOW_AS_OCTAL(i) ( (i) < 0x21 || (i) == 0x7f || (i) == 0xff )
#else
#define SHOW_AS_OCTAL(i) ( (i) < 0x21 || (i) > 0x7e )
#endif

/*
 * Which characters introduce a command-line switch.  '/' is a switch
 * character only on DOS-family systems; elsewhere it begins an absolute
 * path name, so the locations flag must be spelled -L there.
 */
#ifdef A_BYTES_DOS
#define IS_SWITCH(c) ( (c) == '-' || (c) == '/' )
#else
#define IS_SWITCH(c) ( (c) == '-' )
#endif

char *Cmdname_( void );
void  Usage_( void );
void  process( int fd, long int cum[ BYTE_VALUES ],
               long int locn[ BYTE_VALUES ][ LOCN_MAX ], Bool locations );
void  report( long int cum[ BYTE_VALUES ],
              long int locn[ BYTE_VALUES ][ LOCN_MAX ],
              Bool locations, char name_in[] );
void  non_exist( char fname[] );

/*
 * CMDNAME_ - Name of this program as shown in the usage message.
 */
char *Cmdname_( void )
{
    return( "a_bytes" ) ;
}

/*
 * MAIN - Parse the optional locations switch, then analyze and report
 *        on each named file in turn.
 *
 *        Exits with status 1 (via Usage_) when no file is named or the
 *        first argument is an unrecognized switch or '?', and with
 *        status 1 as soon as an input file cannot be opened.
 */
int main( int argc, char **argv )
{
    Bool     locations = FALSE ;    /* requested by user */
    char     c10,                   /* argv[1][0] */
             c11 ;                  /* argv[1][1], or NUL if argv[1] is empty */
    int      fd,                    /* file descriptor */
             file, bgn_at ;
    long int cum[ BYTE_VALUES ],    /* accumulator for each byte */
             locn[ BYTE_VALUES ][ LOCN_MAX ];
                                    /* offset of 8 occurrences of each */

    /* argv[1] must not be examined until we know that it exists. */
    if( argc < 2 )
        Usage_();

    c10 = argv[1][0] ;
    c11 = ( c10 != '\0' ) ? argv[1][1] : '\0' ;

    if( IS_SWITCH( c10 ) && ( c11 == 'L' || c11 == 'l' ) )
        locations = TRUE ;
    else if( IS_SWITCH( c10 ) || c10 == '?' )
        Usage_();

    bgn_at = locations ? 2 : 1 ;

    /* A switch with no file names after it leaves nothing to do. */
    if( bgn_at >= argc )
        Usage_();

    for( file = bgn_at ; file < argc ; file++ )
    {
        if(( fd = open( argv[ file ], O_RDONLY | O_BINARY )) == -1 )
        {
            fprintf( stderr, "Can't open file %s\n", argv[ file ] );
            exit( 1 );
        }
        process( fd, cum, locn, locations );
        report( cum, locn, locations, argv[ file ] ) ;

        if( close( fd ))
            fprintf( stderr, "Problem closing %s\n", argv[ file ] );
    }

    return( 0 );
}

/*
 * USAGE_ - Print the usage summary on stderr and exit with status 1.
 *          Never returns.
 *
 *          NOTE(review): the wording is kept exactly as found; any
 *          column alignment the text once had is not recoverable here.
 */
void Usage_( void )
{
    fprintf( stderr,
        "\nusage: %s [ /L ] file_name[s]\n\n"
        " Analyze the bytes (characters) used within any file, report\n"
        " the frequency of each byte present. If the location flag /L\n"
        " is set, include offsets of the first 8 occurrences of each\n",
        Cmdname_() );
    fprintf( stderr,
        " byte pattern present.\n\n"
        " input: Any file[s] whatsoever.\n\n"
        " output: file_name.BYT which contains up to 256 lines, one line for\n"
        " each different byte present. The byte is shown first in\n"
        " printable OR octal form, then the hexadecimal equivalent.\n" );
    fprintf( stderr,
        " The third column is frequency. The fourth column shows\n"
        " percentage of total occurrences within the file.\n\n"
        " If the /L locations option is selected, the output file is\n"
        " name file_name.LOC and the offsets of the first up to 8\n"
        " occurrences follow at the end of each line.\n\n" ) ;
    fprintf( stderr, "writeup: MIR TUTORIAL ONE, topic 5\n\n" );
    exit( 1 ) ;
}

/*
 * PROCESS - Count how often each byte value occurs in an open file.
 *
 *      fd          descriptor of the file, read from its current position
 *      cum         out: cum[b] = number of occurrences of byte value b
 *                  (every entry is reset to 0 first)
 *      locn        out: when locations is TRUE, locn[b][k] = file offset
 *                  of occurrence k+1 of byte b, for the first LOCN_MAX
 *                  occurrences; entries at or beyond cum[b] are not set
 *      locations   TRUE if offsets are to be recorded
 *
 *      A read error ends the scan with a message on stderr; the counts
 *      then cover only the bytes read before the error.
 */
void process( int fd, long int cum[ BYTE_VALUES ],
              long int locn[ BYTE_VALUES ][ LOCN_MAX ], Bool locations )
{
    unsigned char buf_in[ INTAKE ];
    unsigned char byte ;
    int      buf_len, i ;
    long int gross_locn,    /* file offset of buf_in[0] */
             seen ;         /* occurrences of byte so far, this one included */

    for( i = 0 ; i < BYTE_VALUES ; i++ )
        cum[ i ] = 0 ;

    gross_locn = 0 ;
    repeat
    {
        buf_len = ( int ) read( fd, buf_in, INTAKE ) ;
        if( buf_len < 0 )
        {
            /* Without this test a failed read would loop forever. */
            fprintf( stderr,
                "Problem reading input; only the first %ld bytes are counted\n",
                gross_locn );
            break ;
        }
        if( buf_len == 0 )      /* end of file */
            break ;

        for( i = 0 ; i < buf_len ; i++ )
        {
            byte = buf_in[ i ] ;
            seen = ++cum[ byte ] ;
            if( locations && seen <= LOCN_MAX )
                locn[ byte ][ seen - 1 ] = gross_locn + i ;
        }
        gross_locn += buf_len ;
    }
    return ;
}

/*
 * REPORT - Output the data for analysis of one file
 *
 *      cum         occurrences of each byte value, as filled by process()
 *      locn        recorded offsets, as filled by process()
 *      locations   TRUE if offsets are to be listed on each line
 *      name_in     name of the file that was analyzed
 *
 *      The report name is name_in with everything from the first '.' of
 *      its final path component replaced by ".loc" (locations) or ".byt".
 *      non_exist() may then alter the last one or two characters so that
 *      an existing report is not overwritten.  One line is written for
 *      each byte value that occurred; a summary or an error message goes
 *      to stderr.
 */
void report( long int cum[ BYTE_VALUES ],
             long int locn[ BYTE_VALUES ][ LOCN_MAX ],
             Bool locations, char name_in[] )
{
    FILE       *fp_out ;
    char        fname[ FNAME_MAX ];
    const char *ext ;
    char       *base,           /* start of the final path component */
               *dot, *p ;
    double      pct,            /* % of occurrences */
                f_grand ;       /* grand total bytes */
    Bool        foul_up ;
    int         i, written ;
    long int    grand_total,
                limit,          /* up to 8 are tracked */
                j ;

    ext = locations ? "loc" : "byt" ;

    /* Worst case appends ".ext" plus the terminating NUL: 5 bytes. */
    if( strlen( name_in ) + 5 > sizeof fname )
    {
        fprintf( stderr, "File name too long: %s\n", name_in );
        return ;
    }
    strcpy( fname, name_in ) ;

    /*
     * Look for the extension only after the last directory or drive
     * separator, so that a path such as ..\dir\file.txt is not cut at
     * the dots of its directory part.
     */
    base = fname ;
    for( p = fname ; *p != '\0' ; p++ )
    {
        if( *p == '/' || *p == '\\' || *p == ':' )
            base = p + 1 ;
    }
    dot = strchr( base, '.' ) ;
    if( dot != NULL )
        *dot = '\0' ;
    strcat( fname, "." ) ;
    strcat( fname, ext ) ;

    non_exist( fname ) ;

    if(( fp_out = fopen( fname, "w" )) == NULL )
    {
        fprintf( stderr, "Can't open file %s\n", fname );
        return ;
    }

    grand_total = 0 ;
    foul_up = FALSE ;
    for( i = 0 ; i < BYTE_VALUES ; i++ )
        grand_total += cum[ i ] ;
    f_grand = ( double ) grand_total ;

    for( i = 0 ; i < BYTE_VALUES ; i++ )
    {
        if( cum[ i ] == 0 )
            continue ;

        /* f_grand is nonzero here, because cum[i] is part of the total. */
        pct = 100.0 * ( ( double ) cum[ i ] / f_grand ) ;

        if( SHOW_AS_OCTAL( i ) )
            written = fprintf( fp_out, "\\%03o [%02X]%7ld %4.1f%%",
                        ( unsigned ) i, ( unsigned ) i, cum[ i ], pct ) ;
        else
            written = fprintf( fp_out, "%c [%02X]%7ld %4.1f%%",
                        ( unsigned char ) i, ( unsigned ) i, cum[ i ], pct ) ;

        /* fprintf reports failure with a negative value, never with 0. */
        if( written < 0 )
            foul_up = TRUE ;

        if( locations )
        {
            fputc( ' ', fp_out );
            fputc( ' ', fp_out );
            limit = LOCN_MAX ;
            if( cum[ i ] < LOCN_MAX )
                limit = cum[ i ] ;
            for( j = 0 ; j < limit ; j++ )
                fprintf( fp_out, " %ld", locn[ i ][ j ] );
        }
        fputc( '\n', fp_out );
    }

    /* Push buffered output out now so that a late write error is seen. */
    if( fflush( fp_out ) != 0 || ferror( fp_out ) )
        foul_up = TRUE ;

    if( foul_up )
        fprintf( stderr, "Unable to write report in file %s\n", fname );
    else
        fprintf( stderr,
            "\n\nInput size = %ld bytes. Results are in file %s\n\n",
            grand_total, fname );

    if( fclose( fp_out ))
        fprintf( stderr, "Problem closing %s\n", fname );
    return ;
}

/*
 * NON_EXIST    Test the existence of a file; if it exists,
 *              substitute digits successively for the last
 *              byte in the name until a non-existent file
 *              is named, or until the last digit is '9'; do
 *              the same with the second last byte, for 100
 *              possible combinations
 *
 *      fname   in/out: candidate name, modified in place.  It is left
 *              unchanged when stat() fails on it, which is taken to mean
 *              that no such file exists.  If every candidate exists,
 *              fname is left holding the last name tried.
 */
void non_exist( char fname[] )
{
    struct stat buf ;
    size_t      len ;
    int         decade,     /* batch of 10 names */
                unit ;

    if( stat( fname, &buf ) != 0 )
        return ;            /* no data obtained = name is free */

    len = strlen( fname ) ;
    if( len == 0 )
        return ;

    for( decade = 0 ; decade < 10 ; decade++ )
    {
        if( decade > 0 )
        {
            /* Don't mess with one digit name extension */
            if( len < 2 || fname[ len - 2 ] == '.' )
                break ;
            fname[ len - 2 ] = ( char )( '0' + decade ) ;
        }

        /* Try names ending in 0 through 9 */
        for( unit = 0 ; unit < 10 ; unit++ )
        {
            fname[ len - 1 ] = ( char )( '0' + unit ) ;
            if( stat( fname, &buf ) != 0 )
                return ;    /* found a name that is not in use */
        }
    }
    return ;
}