Power-Programmierung

home *** CD-ROM | disk | FTP | other *** search

/ Power-Programmierung / CD2.mdf / doc / mir / f_print.c < prev next >

Wrap

C/C++ Source or Header | 1992-07-02 | 13.2 KB | 383 lines

/*
 *  usage:  f_print file_name [/a][/w] [ from_byte to_byte ] > subset
 *
 *  F_PRINT Reduces a file to printable characters only.  If the /w
 *          option is specified, strings of printable characters that
 *          are unlikely to be words are filtered out as well, and
 *          each new burst of accepted text is placed on a new line.
 *          /a causes accented characters to be accepted as printable.
 *
 *  input:  Any file whatsoever, or any part of a file.
 *
 *  output: Printable subset.
 *
 *  writeup: MIR TUTORIAL ONE, topic 5
 *
 *  Written:    Douglas Lowry   Jan 07 92
 *  Modified:   Douglas Lowry   Feb 27 92
 *              Copyright (C) 1992 Marpex Inc.
 *
 *  The MIR (Mass Indexing and Retrieval) Tutorials explain detailed
 *  usage and co-ordination of the MIR family of programs to analyze,
 *  prepare and index databases (small through gigabyte size), and
 *  how to build integrated retrieval software around the MIR search
 *  engine.  The fifth of the five MIR tutorial series explains how
 *  to extend indexing capability into leading edge search-related
 *  technologies.  For more information, GO IBMPRO on CompuServe;
 *  MIR files are in the DBMS library.  The same files are on the
 *  Canada Remote Systems BBS.  A diskette copy of the Introduction
 *  is available by mail ($10 US... check, Visa or Mastercard);
 *  diskettes with Introduction, Tutorial ONE software and the
 *  shareware Tutorial ONE text cost $29.  Shareware registration
 *  for a tutorial is also $29.
 *
 *  E-mail...
 *              Compuserve  71431,1337
 *              Internet    doug.lowry%canrem.com
 *              UUCP        canrem!doug.lowry
 *              Others:     doug.lowry@canrem.uucp
 *
 *  FAX...                  416 963-5677
 *
 *  "Snail mail"...         Douglas Lowry, Ph.D.
 *                          Marpex Inc.
 *                          5334 Yonge Street, #1102
 *                          North York, Ontario
 *                          Canada  M2N 6M2
 *
 *  Related database consultation and preparation services are
 *  available through:
 *              Innotech Inc., 2001 Sheppard Avenue E., Suite #118,
 *              North York, Ontario  Canada   M2J 4Z7
 *              Tel.  416 492-3838   FAX  416 492-3843
 *
 *  This program is free software; you may redistribute it and/or
 *  modify it under the terms of the GNU General Public License as
 *  published by the Free Software Foundation; either version 2 of
 *  the License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  (file 05LICENS) along with this program; if not, write to the
 *  Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
 *  USA.
 */

#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define BIGBUF          2048    /* bytes read from the file per fread() */
#define STORE           32      /* bytes of a burst examined before it is trusted */

/* Character classes held in table[] below. */
#define NON_PRINT       0
#define WHITE_SPACE     1
#define PUNCTUATION     2
#define DIGIT           3
#define CONSONANT       4
#define VOWEL           5
#define HI_CONSONANT    6       /* accented consonant (high half of code page) */
#define HI_VOWEL        7       /* accented vowel (high half of code page) */
#define TYPE_CT         8       /* count of above types */

#define repeat  for(;;)

/*
 * declarations
 */

typedef enum _bool { FALSE = 0, TRUE = 1 } Bool;

char    *Cmdname_( void );
void    Usage_( void );
void    process( FILE *fp, long int fr_byte, long int to_byte,
                 Bool accent_ok, Bool words );
void    clear_store( int *in_store, int stor_typ[ TYPE_CT ] );
Bool    check_store( int in_store, const int typ[ TYPE_CT ] );

static long parse_offset( const char *text );
static int  is_accented( int type );
static void emit_byte( int ch );
static void emit_store( const unsigned char *store, int count );
static void end_burst( Bool words, const unsigned char *store,
                       int *in_store, int stor_typ[ TYPE_CT ] );

/* Name of this program, as shown in the usage message. */
char *Cmdname_( void ) { return( "f_print" ); }

/*
 * GLOBAL VARIABLES
 */

/*
 * Class of each of the 256 byte values.  The labels on the upper half
 * are those written by the original author and correspond to the IBM PC
 * character set; bytes without a label are NON_PRINT.
 *
 * NOTE(review): '.' is classed as VOWEL and '@' as CONSONANT, exactly as
 * in the original table.  Both look like typing slips for PUNCTUATION,
 * but they change what check_store() accepts, so confirm the intent
 * before altering them.
 */
static const unsigned char table[256] = {
    /* 00-0F: controls; TAB, LF and CR count as white space */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
    /* 10-1F: controls */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* bl !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /  */
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 2,
    /* 0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?  */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2,
    /* @  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O  */
    4, 5, 4, 4, 4, 5, 4, 4, 4, 5, 4, 4, 4, 4, 4, 5,
    /* P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _  */
    /* 'U' is a vowel, matching lowercase 'u' below (it was 4). */
    4, 4, 4, 4, 4, 5, 4, 4, 4, 5, 4, 2, 2, 2, 2, 2,
    /* `  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o  */
    2, 5, 4, 4, 4, 5, 4, 4, 4, 5, 4, 4, 4, 4, 4, 5,
    /* p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~  DEL */
    4, 4, 4, 4, 4, 5, 4, 4, 4, 5, 4, 2, 2, 2, 2, 0,
    /* Ç  ü  é  â  ä  à  å  ç  ê  ë  è  ï  î  ì  Ä  Å  */
    6, 7, 7, 7, 7, 7, 7, 6, 7, 7, 7, 7, 7, 7, 7, 7,
    /* É  æ  Æ  ô  ö  ò  û  ù  ÿ  Ö  Ü  ¢  £  ¥  ₧  ƒ  */
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0,
    /* á  í  ó  ú  ñ  Ñ  ª  º  ¿  ⌐  ¬  ½  ¼  ¡  «  »  */
    7, 7, 7, 7, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0-FF: not printable */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};

/*
 * MAIN
 *
 * usage: f_print file_name [/a][/w] [ from_byte to_byte ]
 *
 * Options may start with '-' or '/', and only the letter that follows is
 * examined (case-insensitive).  The first non-option argument is
 * from_byte; any later one sets to_byte.
 */
int main( int argc, char **argv )
{
    FILE    *fp;
    char    c10;
    Bool    words = FALSE,
            accent_ok = FALSE,      /* allow accented chars */
            got_from = FALSE;       /* found a "from_byte" argument */
    int     i;
    long    fr_byte = 0,            /* byte range, inclusive */
            to_byte = 0x0fffffff;

    /* Check argc BEFORE touching argv[1]; with no arguments it is NULL. */
    if( argc < 2 || argc > 6 )
        Usage_();
    c10 = argv[1][0];
    if( c10 == '-' || c10 == '/' || c10 == '?' )
        Usage_();

    if(( fp = fopen( argv[1], "rb" )) == NULL ) {
        fprintf( stderr, "\nUnable to open file %s.\n", argv[1] );
        Usage_();
    }

    for( i = 2; i < argc; i++ ) {
        const char  *arg = argv[i];
        int         opt = 0;

        /* arg[1] is only read when arg[0] is not the terminator. */
        if( arg[0] == '-' || arg[0] == '/' )
            opt = toupper( (unsigned char) arg[1] );

        if( opt == 'W' )
            words = TRUE;
        else if( opt == 'A' )
            accent_ok = TRUE;
        else if( got_from )
            to_byte = parse_offset( arg );
        else {
            fr_byte = parse_offset( arg );
            got_from = TRUE;
        }
    }

    if( fr_byte ) {
        if( fseek( fp, fr_byte, SEEK_SET )) {
            fprintf( stderr, "Unable to position %s to %ld\n",
                     argv[1], fr_byte );
            Usage_();
        }
    }

    process( fp, fr_byte, to_byte, accent_ok, words );

    fclose( fp );
    /* Buffered output errors only surface when the stream is flushed. */
    if( fflush( stdout ) == EOF ) {
        fprintf( stderr, "Unable to write... FATAL.\n\n" );
        exit( 1 );
    }
    return 0;
}

/*
 * PARSE_OFFSET - Converts a command line argument to a non-negative
 *      byte offset.  Anything that is not a complete decimal number in
 *      range is reported and ends the program through Usage_().
 */
static long parse_offset( const char *text )
{
    char    *end;
    long    value;

    errno = 0;
    value = strtol( text, &end, 10 );
    if( end == text || *end != '\0' || errno == ERANGE || value < 0 ) {
        fprintf( stderr, "\nInvalid byte offset \"%s\".\n", text );
        Usage_();
    }
    return value;
}

/*
 * Usage - Prints the help text on stderr and exits with status 1.
 */
void Usage_( void )
{
    fprintf( stderr,
        "\nUsage: %s file_name [/a][/w] [ from_byte to_byte ] > subset\n\n"
        " Reduces a file to printable characters only. If the /w\n"
        " option is specified, strings of printable characters that\n"
        " are unlikely to be words are filtered out as well, and\n",
        Cmdname_() );
    fputs(
        " each new burst of accepted text is placed on a new line.\n"
        " /a causes accented characters to be accepted as printable.\n\n"
        " input: Any file whatsoever, or any part of a file.\n\n"
        " output: Printable subset.\n\n"
        " writeup: MIR TUTORIAL ONE, topic 5\n\n",
        stderr );
    exit( 1 );
}

/* True when "type" is one of the two accented character classes. */
static int is_accented( int type )
{
    return type == HI_CONSONANT || type == HI_VOWEL;
}

/* Writes one byte to stdout; a write failure ends the program. */
static void emit_byte( int ch )
{
    if( putchar( ch ) == EOF ) {
        fprintf( stderr, "Unable to write... FATAL.\n\n" );
        exit( 1 );
    }
}

/* Writes the first "count" bytes held in "store" to stdout. */
static void emit_store( const unsigned char *store, int count )
{
    int     i;

    for( i = 0; i < count; i++ )
        emit_byte( store[i] );
}

/*
 * END_BURST - Closes the current run of printable bytes.  In /w mode a
 *      run that was already streaming (store full and accepted) just
 *      gets its newline; a shorter run is written, followed by a
 *      newline, only if check_store() accepts it.  The store is then
 *      emptied.  Without /w nothing is buffered and this is a no-op.
 */
static void end_burst( Bool words, const unsigned char *store,
                       int *in_store, int stor_typ[ TYPE_CT ] )
{
    if( words ) {
        if( *in_store == STORE )
            emit_byte( '\n' );
        else if( *in_store > 0 && check_store( *in_store, stor_typ )) {
            emit_store( store, *in_store );
            emit_byte( '\n' );
        }
    }
    if( *in_store )
        clear_store( in_store, stor_typ );
}

/*
 * PROCESS - Passes through file from starting position,
 *      filtering out unprintable material.
 *
 *  fp          file already positioned at fr_byte
 *  fr_byte     offset of the first byte to examine
 *  to_byte     offset of the last byte to examine (inclusive)
 *  accent_ok   accept accented characters as printable
 *  words       buffer each burst and keep it only if it looks like words
 */
void process( FILE *fp, long int fr_byte, long int to_byte,
              Bool accent_ok, Bool words )
{
    /*  The technique implemented below tests only the first STORE
     *  characters of a printable sequence.  Once this limit is reached,
     *  we assume full printability until a NON_PRINT character is found.
     *  The function check_store controls the criteria for whether the
     *  start of a printable sequence passes.
     *
     *  in_store == STORE therefore means "streaming": the stored bytes
     *  have already been written and later bytes go straight out.
     */
    unsigned char   buffer[ BIGBUF ], store[ STORE ], uc;
    long int        offset = fr_byte;   /* file offset of the next byte */
    size_t          buflen,             /* of buffer contents */
                    pt;
    int             in_store,           /* consecutive bytes in store */
                    stor_typ[ TYPE_CT ],/* count each type in store */
                    type = NON_PRINT,   /* of character per table above */
                    prev_type;          /* previous type */

    clear_store( &in_store, stor_typ );

    /* offset advances exactly once per byte, so the range is exact. */
    while( offset <= to_byte ) {
        buflen = fread( buffer, sizeof buffer[0], BIGBUF, fp );
        if( buflen == 0 )
            break;

        for( pt = 0; pt < buflen && offset <= to_byte; pt++, offset++ ) {
            uc = buffer[ pt ];
            prev_type = type;
            type = table[ uc ];

            if( !accent_ok && is_accented( type ))
                type = NON_PRINT;

            /*  Two accented characters in sequence are not printable.
             *  The previous one can only be taken back while it is
             *  still waiting in the store; once the burst is streaming
             *  it has already been written.
             */
            if( is_accented( prev_type ) && is_accented( type )) {
                type = NON_PRINT;
                if( in_store > 0 && in_store < STORE ) {
                    in_store--;
                    stor_typ[ prev_type ]--;
                }
            }

            if( type == NON_PRINT ) {
                end_burst( words, store, &in_store, stor_typ );
                continue;
            }

            /*  printable characters - output or add to store  */
            if( !words || in_store == STORE ) {
                emit_byte( uc );
                continue;
            }

            store[ in_store++ ] = uc;
            stor_typ[ type ]++;
            if( in_store == STORE ) {
                /* A rejected store is dropped and the rest of the run
                 * is judged afresh, STORE bytes at a time. */
                if( check_store( in_store, stor_typ ))
                    emit_store( store, in_store );
                else
                    clear_store( &in_store, stor_typ );
            }
        }
    }

    /* Text still pending when the input or the range ends is judged the
     * same way as text ended by an unprintable byte. */
    end_burst( words, store, &in_store, stor_typ );

    if( ferror( fp )) {
        fprintf( stderr, "Unable to read... FATAL.\n\n" );
        exit( 1 );
    }
    return;
}

/*
 * CLEAR_STORE - Empties the store: zeroes the byte count and every
 *      per-type counter.
 */
void clear_store( int *in_store, int stor_typ[ TYPE_CT ] )
{
    int     i;

    *in_store = 0;
    for( i = 0; i < TYPE_CT; i++ )
        stor_typ[ i ] = 0;
    return;
}

/*
 * CHECK_STORE - Is the series held in "store" valid "words"?
 *
 *  In the version that follows, a series passes if it contains 5 or
 *  more bytes made up of:
 *      1.  digits with NO vowels AND NO consonants
 *      2.  vowels AND consonants (with or without digits)
 *  Accented vowels and consonants count as vowels and consonants.
 *  The counters in "typ" are read only.
 *
 *  You may wish to try alternative forms of this function.  Its objective
 *  is to maximize retention of desired text while minimizing retention of
 *  junk.  Proximity might be considered... more than 4 consonants in a
 *  row, no vowels between white spaces, etc.
 */
Bool check_store( int in_store, const int typ[ TYPE_CT ] )
{
    int     consonants, vowels;

    if( in_store < 5 )
        return( FALSE );

    consonants = typ[ CONSONANT ] + typ[ HI_CONSONANT ];
    vowels = typ[ VOWEL ] + typ[ HI_VOWEL ];

    if( typ[ DIGIT ] && !consonants && !vowels )
        return( TRUE );
    if( consonants && vowels )
        return( TRUE );

    return( FALSE );
}