Power-Programmierung

home *** CD-ROM | disk | FTP | other *** search

/ Power-Programmierung / CD2.mdf / doc / mir / a_len.c < prev next >

Wrap

C/C++ Source or Header | 1992-07-02 | 12.0 KB | 377 lines

/*
 *  usage:  a_len [interval] file_name[s]
 *
 *  A_LEN   Analyze the distribution of line lengths up to 1024 bytes
 *          within any file.  The reporting interval (an integer from
 *          1 to 100) is a count of the lengths that will be grouped
 *          together.  For example, an interval of 10 means that
 *          frequencies of length 0, length 1-10, length 11-20, etc.
 *          are shown in the report.  The default interval is 10.  If
 *          the first file name starts with numeric digits, show the
 *          interval first!
 *
 *  input:  Any ASCII file[s].
 *
 *  output: file_name.len which reports the frequency of line lengths
 *          occurring in the file.  Lengths exclude carriage returns and
 *          line feeds.
 *
 *  writeup: MIR TUTORIAL ONE, topic 6
 *
 *  written:    Douglas Lowry   Feb 13 92
 *  modified:   Douglas Lowry   Feb 14 92
 *              Douglas Lowry   Mar 20 92  Cycle thru report names
 *              Copyright (C) 1992 Marpex Inc.
 *              Compiled with STACK = 8000
 *  NOTE THE PROGRAM COMPILES MUCH SMALLER (10,000 INSTEAD OF 19,200 BYTES)
 *  IF THE USE OF DOUBLE TYPE IS OMITTED IN THE "REPORT" FUNCTION.
 *
 *  The MIR (Mass Indexing and Retrieval) Tutorials explain detailed
 *  usage and co-ordination of the MIR family of programs to analyze,
 *  prepare and index databases (small through gigabyte size), and
 *  how to build integrated retrieval software around the MIR search
 *  engine.  The fifth of the five MIR tutorial series explains how
 *  to extend indexing capability into leading edge search-related
 *  technologies.  For more information, GO IBMPRO on CompuServe;
 *  MIR files are in the DBMS library.  The same files are on the
 *  Canada Remote Systems BBS.  A diskette copy of the Introduction
 *  is available by mail ($10 US... check, Visa or Mastercard);
 *  diskettes with Introduction, Tutorial ONE software and the
 *  shareware Tutorial ONE text cost $29.  Shareware registration
 *  for a tutorial is also $29.
 *
 *  E-mail...
 *              Compuserve  71431,1337
 *              Internet    doug.lowry%canrem.com
 *              UUCP        canrem!doug.lowry
 *              Others:     doug.lowry@canrem.uucp
 *
 *  FAX...      416 963-5677
 *
 *  "Snail mail"...
 *              Douglas Lowry, Ph.D.
 *              Marpex Inc.
 *              5334 Yonge Street, #1102
 *              North York, Ontario
 *              Canada  M2N 6M2
 *
 *  Related database consultation and preparation services are
 *  available through:
 *              Innotech Inc., 2001 Sheppard Avenue E., Suite #118,
 *              North York, Ontario  Canada   M2J 4Z7
 *              Tel.  416 492-3838   FAX  416 492-3843
 *
 *  This program is free software; you may redistribute it and/or
 *  modify it under the terms of the GNU General Public License as
 *  published by the Free Software Foundation; either version 2 of
 *  the License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  (file 05LICENS) along with this program; if not, write to the
 *  Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
 *  USA.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>

#define repeat for(;;)

typedef enum { FALSE = 0, TRUE = 1 } Bool ;

#define MAX_BYTES   2048                /* # of bytes in input buffer   */
#define MAX_LEN     1024                /* longest length tallied alone */
#define OVER_SLOT   ( MAX_LEN + 1 )     /* slot counting longer lines   */
#define CUM_SIZE    ( MAX_LEN + 2 )     /* slots 0..MAX_LEN + OVER_SLOT */

char    *Cmdname_( void );
void    Usage_( void );
void    process( FILE *fp, long int cum[] );
void    report( FILE *fp_out, long int cum[], int interval,
                const char *fname );
void    non_exist( char fname[] );

static int      report_name( const char *in_name, char *out,
                             size_t out_size );
static double   percent( long int part, double whole );

/*
 *  CMDNAME_ - Name of this program, as shown in the usage text.
 */
char *Cmdname_( void )
{
    return( "a_len" ) ;
}

/*
 *  MAIN - Parse the optional interval, then for every named file
 *         tally its line lengths and write the matching report.
 *
 *  Exits 1 (via Usage_ or directly) on bad arguments or when a file
 *  cannot be opened; returns 0 otherwise.
 */
int main( int argc, char **argv )
{
    FILE    *fp, *fp_out ;
    char    fname[ FILENAME_MAX ], c10 ;
    long    val ;
    int     interval,       /* argument 1 - 100             */
            from,           /* argument # of first file name*/
            file ;
    long int cum[ CUM_SIZE ];   /* accumulator for each length  */
                    /* Those over 1024 are counted in [1025] */

    /* argc must be tested BEFORE argv[1] is touched. */
    if( argc < 2 )
        Usage_();

    /*
     * '-', '/' and '?' are treated as a request for help.  '/' is the
     * DOS switch character, so a file name given first may not begin
     * with it.
     */
    c10 = argv[1][0] ;
    if( c10 == '-' || c10 == '/' || c10 == '?' )
        Usage_();

    interval = 10 ;
    from = 1 ;

    /*
     * Leading digits of the first argument are taken as the interval.
     * strtol saturates on overflow, so huge values land in the
     * "> 100" rejection instead of invoking undefined behaviour.
     */
    val = strtol( argv[1], NULL, 10 ) ;
    if( val > 100 )
        Usage_() ;
    if( val > 0 )
    {
        interval = ( int ) val ;
        from = 2 ;
    }

    /* An interval with no file names leaves nothing to do. */
    if( from >= argc )
        Usage_() ;

    for( file = from; file < argc ; file++ )
    {
        /* Read-only access is enough; binary mode keeps any CR visible. */
        if(( fp = fopen( argv[ file ], "rb" )) == NULL )
        {
            fprintf( stderr, "Can't open file %s\n", argv[ file ] );
            exit( 1 );
        }
        process( fp, cum );
        if( fclose( fp ))
            fprintf( stderr, "Problem closing %s\n", argv[ file ] );

        if( report_name( argv[ file ], fname, sizeof fname ))
        {
            fprintf( stderr, "File name too long: %s\n", argv[ file ] );
            exit( 1 );
        }
        non_exist( fname ) ;

        if(( fp_out = fopen( fname, "w" )) == NULL )
        {
            fprintf( stderr, "Can't open file %s\n", fname );
            exit( 1 );
        }
        report( fp_out, cum, interval, fname ) ;
        if( fclose( fp_out ))
            fprintf( stderr, "Problem closing %s\n", fname );
    }

    return 0 ;
}

/*
 *  REPORT_NAME - Build the report file name for in_name: the extension
 *                of the final path component (text after its last '.',
 *                a leading '.' excepted) is replaced by "len"; if there
 *                is no extension, ".len" is appended.
 *
 *  Returns 0 on success, -1 if the result does not fit in out_size.
 */
static int report_name( const char *in_name, char *out, size_t out_size )
{
    const char  *base = in_name, *dot, *p ;
    size_t      stem ;                  /* bytes kept from in_name */

    /* Only dots after the last path separator mark an extension. */
    for( p = in_name ; *p ; p++ )
    {
        if( *p == '/' || *p == '\\' || *p == ':' )
            base = p + 1 ;
    }

    dot = strrchr( base, '.' ) ;
    if( dot == NULL || dot == base )
        stem = strlen( in_name ) ;
    else
        stem = ( size_t )( dot - in_name ) ;

    /* sizeof ".len" counts the terminating NUL as well. */
    if( stem + sizeof ".len" > out_size )
        return -1 ;

    memcpy( out, in_name, stem ) ;
    memcpy( out + stem, ".len", sizeof ".len" ) ;
    return 0 ;
}

/*
 *  USAGE_ - Print the help text on stderr and exit with status 1.
 *
 *  NOTE(review): the leading blanks of the help lines follow the text
 *  as received; confirm the intended indentation.
 */
void Usage_( void )
{
    fprintf( stderr, "\nusage: %s [interval] file_name[s]\n\n"
        " Analyze the distribution of line lengths up to 1024 bytes\n"
        " within any file. The reporting interval (an integer from\n"
        " 1 to 100) is a count of the lengths that will be grouped\n",
        Cmdname_() );
    fprintf( stderr,
        " together. For example, an interval of 10 means that\n"
        " frequencies of length 0, length 1-10, length 11-20, etc.\n"
        " are shown in the report. The default interval is 10. If\n"
        " the first file name starts with numeric digits, show the\n" );
    fprintf( stderr,
        " interval first!\n\n"
        " input: Any ASCII file[s].\n\n"
        " output: file_name.len which reports the frequency of line lengths\n"
        " occuring in the file. Lengths exclude carriage returns and\n"
        " line feeds.\n\n"
        " writeup: MIR TUTORIAL ONE, topic 6\n\n" ) ;
    exit( 1 ) ;
}

/*
 *  PROCESS - Tally the length of every line read from fp.
 *
 *  cum must have CUM_SIZE slots.  It is cleared first; afterwards
 *  cum[n] holds the number of lines of length n (0..1024) and
 *  cum[1025] the number of longer lines.  Trailing line feeds and
 *  carriage returns are not counted.  A line too long for the input
 *  buffer ends the program with status 1.
 */
void process( FILE *fp, long int cum[] )
{
    char    line_in[ MAX_BYTES ] ;
    int     len, i ;

    for( i = 0 ; i < CUM_SIZE ; i++ )
        cum[ i ] = 0 ;

    while( fgets( line_in, MAX_BYTES, fp ) != NULL )
    {
        len = ( int ) strlen( line_in );

        /* "len > 0" stops the scan from reading line_in[-1] on a
           line that consists only of CR/LF (or starts with a NUL). */
        while( len > 0 && ( line_in[ len-1 ] == '\n'
                         || line_in[ len-1 ] == '\015' ))
            len-- ;

        if( len > MAX_BYTES - 4 )
        {
            fprintf( stderr, "FATAL... Line exceeds %d bytes.\n\n", len );
            exit( 1 );
        }
        if( len > MAX_LEN )
            cum[ OVER_SLOT ]++ ;
        else
            cum[ len ]++ ;
    }
    return;
}

/*
 *  PERCENT - part as a percentage of whole.
 */
static double percent( long int part, double whole )
{
    return 100.0 * (( double ) part / whole ) ;
}

/*
 *  REPORT - Write the analysis.
 *
 *  fp_out      open report file
 *  cum         tallies produced by process()
 *  interval    number of lengths grouped per report line (1 - 100);
 *              values below 1 are treated as 1
 *  fname       report file name, used only in the closing message
 *
 *  Length 0 is reported on its own, lengths 1..1024 in groups of
 *  "interval", and longer lines on a final line.  Groups with no
 *  lines are skipped.  A summary (or a write failure notice) goes
 *  to stderr.
 */
void report( FILE *fp_out, long int cum[], int interval,
             const char *fname )
{
    Bool    foul_up = FALSE ;
    double  pct, d_lines ;      /* count of lines */
    long int group_cum,         /* frequency across interval */
            lines = 0 ;
    int     i, rc, low, high;   /* ends of interval */

    for( i = 0 ; i < CUM_SIZE ; i++ )
        lines += cum[ i ] ;
    d_lines = ( double ) lines ;

    /* A non-positive interval would never advance the loop below. */
    if( interval < 1 )
        interval = 1 ;

    /* fprintf signals failure with a NEGATIVE value, never with 0. */
    if( cum[0] )
    {
        pct = percent( cum[0], d_lines ) ;
        if( fprintf( fp_out, " 0:%7ld %4.1f%%\n", cum[0], pct ) < 0 )
            foul_up = TRUE ;
    }

    for( low = 1 ; low <= MAX_LEN && !foul_up ; low = high + 1 )
    {
        /* The final group is clipped at MAX_LEN. */
        high = low + interval - 1 ;
        if( high > MAX_LEN )
            high = MAX_LEN ;

        group_cum = 0 ;
        for( i = low ; i <= high ; i++ )
            group_cum += cum[ i ] ;
        if( !group_cum )
            continue ;

        pct = percent( group_cum, d_lines ) ;
        if( interval == 1 )
            rc = fprintf( fp_out, " %4d:%7ld %4.1f%%\n",
                low, group_cum, pct ) ;
        else
            rc = fprintf( fp_out, "%4d -%4d:%7ld %4.1f%%\n",
                low, high, group_cum, pct ) ;
        if( rc < 0 )
            foul_up = TRUE ;
    }

    if( cum[ OVER_SLOT ] && !foul_up )
    {
        pct = percent( cum[ OVER_SLOT ], d_lines ) ;
        if( interval == 1 )
            rc = fprintf( fp_out, "1025+:%7ld %4.1f%%\n",
                cum[ OVER_SLOT ], pct ) ;
        else
            rc = fprintf( fp_out, " Over 1024:%7ld %4.1f%%\n",
                cum[ OVER_SLOT ], pct ) ;
        if( rc < 0 )
            foul_up = TRUE ;
    }

    if( foul_up )
        fprintf( stderr, "Unable to write report in file %s\n", fname );
    else
        fprintf( stderr, "\n%ld lines reported in file %s\n\n",
            lines, fname );
    return ;
}

/*
 *  NON_EXIST   Test the existence of a file; if it exists,
 *              substitute digits successively for the last
 *              byte in the name until a non-existent file
 *              is named, or until the last digit is '9'; do
 *              the same with the second last byte, for 100
 *              possible combinations.
 *
 *  fname is modified in place.  If every candidate exists, fname is
 *  left at the last name tried.
 */
void non_exist( char fname[] )
{
    struct stat buf;
    size_t  len ;
    int     decade,         /* batch of 10 names */
            unit ;

    if( stat( fname, &buf ) != 0 )
        return ;            /* no data obtained = name is free */

    len = strlen( fname ) ;
    if( len == 0 )
        return ;

    for( decade = 0 ; decade < 10 ; decade++ )
    {
        if( decade )
        {
            /* Don't mess with one digit name extension, and never
               index before the start of a one-byte name. */
            if( len < 2 || fname[ len - 2 ] == '.' )
                break ;
            fname[ len - 2 ] = ( char )( '0' + decade ) ;
        }

        /* Try names ending in 0 through 9 */
        for( unit = 0 ; unit < 10 ; unit++ )
        {
            fname[ len - 1 ] = ( char )( '0' + unit ) ;
            if( stat( fname, &buf ) != 0 )
                return ;    /* gotcha: this name is unused */
        }
    }
    return ;
}