home *** CD-ROM | disk | FTP | other *** search
- /*
- * usage: f_print file_name [/a][/w] [ from_byte to_byte ] > subset
- *
- * F_PRINT Reduces a file to printable characters only. If the /w
- * option is specified, strings of printable characters that
- * are unlikely to be words are filtered out as well, and
- * each new burst of accepted text is placed on a new line.
- * /a causes accented characters to be accepted as printable.
- *
- * input: Any file whatsoever, or any part of a file.
- *
- * output: Printable subset.
- *
- * writeup: MIR TUTORIAL ONE, topic 5
- *
- * Written: Douglas Lowry Jan 07 92
- * Modified: Douglas Lowry Feb 27 92
- * Copyright (C) 1992 Marpex Inc.
- *
- * The MIR (Mass Indexing and Retrieval) Tutorials explain detailed
- * usage and co-ordination of the MIR family of programs to analyze,
- * prepare and index databases (small through gigabyte size), and
- * how to build integrated retrieval software around the MIR search
- * engine. The fifth of the five MIR tutorial series explains how
- * to extend indexing capability into leading edge search-related
- * technologies. For more information, GO IBMPRO on CompuServe;
- * MIR files are in the DBMS library. The same files are on the
- * Canada Remote Systems BBS. A diskette copy of the Introduction
- * is available by mail ($10 US... check, Visa or Mastercard);
- * diskettes with Introduction, Tutorial ONE software and the
- * shareware Tutorial ONE text cost $29. Shareware registration
- * for a tutorial is also $29.
- *
- * E-mail...
- * Compuserve 71431,1337
- * Internet doug.lowry%canrem.com
- * UUCP canrem!doug.lowry
- * Others: doug.lowry@canrem.uucp
- *
- * FAX... 416 963-5677
- *
- * "Snail mail"... Douglas Lowry, Ph.D.
- * Marpex Inc.
- * 5334 Yonge Street, #1102
- * North York, Ontario
- * Canada M2N 6M2
- *
- * Related database consultation and preparation services are
- * available through:
- * Innotech Inc., 2001 Sheppard Avenue E., Suite #118,
- * North York, Ontario Canada M2J 4Z7
- * Tel. 416 492-3838 FAX 416 492-3843
- *
- * This program is free software; you may redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * (file 05LICENS) along with this program; if not, write to the
- * Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
- * USA.
- */
-
- #include <stdio.h>
- #include <stdlib.h>
- #include <ctype.h>
-
/* Buffer sizes. */
enum {
    BIGBUF = 2048,      /* bytes requested from the input file per read */
    STORE  = 32         /* leading bytes of a burst held back for testing */
};

/* Character classes; these are the codes kept in the lookup table. */
enum {
    NON_PRINT    = 0,
    WHITE_SPACE  = 1,
    PUNCTUATION  = 2,
    DIGIT        = 3,
    CONSONANT    = 4,
    VOWEL        = 5,
    HI_CONSONANT = 6,
    HI_VOWEL     = 7,
    TYPE_CT      = 8    /* count of above types */
};

/* Endless loop; left only through break, return or exit. */
#define repeat for(;;)
-
/*
 * declarations
 */

/* Two-valued truth type used for the option flags and predicate results. */
typedef enum _bool
    { FALSE = 0, TRUE = 1 } Bool;

/*
 * Full prototypes for the functions defined further down, so that the
 * compiler checks the number and types of arguments at every call.
 */
void    Usage_( void ) ;
void    process( FILE *fp, long int fr_byte, long int to_byte,
                 Bool accent_ok, Bool words ) ;
void    clear_store( int *in_store, int *stor_typ ) ;
Bool    check_store( int in_store, int *typ ) ;
/* Program name, as shown in the usage text. */
char *
Cmdname_()
{
    return "f_print" ;
}
-
/*
 * GLOBAL VARIABLES
 */

/*
 * Character class of every byte value 0..255, stored as one of the codes
 * 0 (NON_PRINT) through 7 (HI_VOWEL).  Each row holds 16 entries.  The
 * table is only ever read, hence const.
 *
 * Upper- and lower-case letters carry the same class, and '@' is
 * punctuation like every other ASCII symbol.
 *
 * NOTE(review): '.' is classed 5 (VOWEL) although every other ASCII
 * symbol is 2 (PUNCTUATION).  Left as found because it may be
 * deliberate - confirm the intent.
 *
 * The legends over the upper half are the glyphs of the original
 * listing (presumably IBM PC code page 437 - TODO confirm).
 */
static const unsigned char table[256] = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,     /* ctls */
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* ctls */
    /* bl !  "  #  $  %  &  '  (  )  *  +  ,  -  .  / */
      1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 2,
    /* 0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ? */
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2,
    /* @  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O */
      2, 5, 4, 4, 4, 5, 4, 4, 4, 5, 4, 4, 4, 4, 4, 5,
    /* P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _ */
      4, 4, 4, 4, 4, 5, 4, 4, 4, 5, 4, 2, 2, 2, 2, 2,
    /* `  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o */
      2, 5, 4, 4, 4, 5, 4, 4, 4, 5, 4, 4, 4, 4, 4, 5,
    /* p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~ DEL */
      4, 4, 4, 4, 4, 5, 4, 4, 4, 5, 4, 2, 2, 2, 2, 0,
    /* Ç  ü  é  â  ä  à  å  ç  ê  ë  è  ï  î  ì  Ä  Å */
      6, 7, 7, 7, 7, 7, 7, 6, 7, 7, 7, 7, 7, 7, 7, 7,
    /* É  æ  Æ  ô  ö  ò  û  ù  ÿ  Ö  Ü  ¢  £  ¥  ₧  ƒ */
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0,
    /* á  í  ó  ú  ñ  Ñ  ª  º  ¿  ⌐  ¬  ½  ¼  ¡  «  » */
      7, 7, 7, 7, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
-
- /*
- * MAIN
- */
-
- main( argc, argv )
- int argc;
- char **argv;
- {
- FILE *fp ;
- char c10 ;
- Bool words,
- accent_ok, /* allow accented chars */
- got_from ; /* found a "from_byte" argument */
- int i ;
- long fr_byte, to_byte; /* byte range */
-
- /* usage: f_print file_name [/a][/w] [ from_byte to_byte ] */
-
- c10 = argv[1][0] ;
- if( argc < 2 || argc > 6 || c10 == '-' || c10 == '/' || c10 == '?' )
- Usage_() ;
-
- if(( fp = fopen( argv[1], "rb" )) == NULL )
- {
- fprintf( stderr, "\nUnable to open file %s.\n", argv[1] );
- Usage_();
- }
-
- words = got_from = accent_ok = FALSE ;
- fr_byte = 0 ;
- to_byte = 0x0fffffff ;
-
- for( i = 2 ; i < argc ; i++ )
- {
- if( islower( argv[i][1] ))
- argv[i][1] = toupper( argv[i][1] ) ;
- c10 = argv[i][0] ;
- if(( c10 == '-' || c10 == '/' ) && argv[i][1] == 'W' )
- words = TRUE ;
- else if(( c10 == '-' || c10 == '/' ) && argv[i][1] == 'A' )
- accent_ok = TRUE ;
- else if( got_from )
- to_byte = atol( argv[i] ) ;
- else
- {
- fr_byte = atol( argv[i] );
- got_from = TRUE ;
- }
- }
-
- if( fr_byte )
- {
- if( fseek( fp, fr_byte, SEEK_SET ))
- {
- fprintf( stderr, "Unable to position %s to %ld\n",
- argv[1], fr_byte );
- Usage_() ;
- }
- }
-
- process( fp, fr_byte, to_byte, accent_ok, words ) ;
-
- fclose( fp );
- exit( 0 );
- }
/*
 * Usage - writes the usage summary to stderr in two pieces, then ends
 *         the program with exit status 1.  Does not return.
 */
void
Usage_()
{
    fprintf( stderr,
        "\nUsage: %s file_name [/a][/w] [ from_byte to_byte ] > subset\n\n"
        "Reduces a file to printable characters only. If the /w\n"
        "option is specified, strings of printable characters that\n"
        "are unlikely to be words are filtered out as well, and\n",
        Cmdname_() ) ;

    /* The second piece has no conversions, so it is written verbatim. */
    fputs(
        " each new burst of accepted text is placed on a new line.\n"
        "/a causes accented characters to be accepted as printable.\n\n"
        "input: Any file whatsoever, or any part of a file.\n\n"
        "output: Printable subset.\n\n"
        "writeup: MIR TUTORIAL ONE, topic 5\n\n",
        stderr ) ;

    exit( 1 ) ;
}
- /*
- * PROCESS - Passes through file from starting position,
- * filtering out unprintable material.
- */
- void
- process( fp, fr_byte, to_byte, accent_ok, words )
- FILE *fp ;
- long int fr_byte, /* beginning offset */
- to_byte ; /* ending offset */
- Bool accent_ok,
- words ;
- {
- /* The technique implemented below tests only the first STORE
- * characters of a printable sequence. Once this limit is reached,
- * we assume full printability until a NON_PRINT character is found.
- * The function check_store controls the criteria for whether the
- * start of a printable sequence passes.
- */
- unsigned char buffer[ BIGBUF ],
- store[ STORE ],
- uc ;
- long int offset, /* cumulative bytes into file */
- up_to ; /* test one beyond "to_byte" */
- int buflen, /* of buffer contents */
- in_store, /* consecutive bytes in store */
- stor_typ[TYPE_CT], /* count each type in store */
- type, /* of character per table above */
- prev_type, /* previous type */
- i, j, pt ;
-
- offset = fr_byte ;
- up_to = to_byte + 1 ;
- prev_type = type = NON_PRINT ;
- clear_store( &in_store, stor_typ ) ;
-
- repeat
- {
- if( offset++ > up_to )
- break ;
- buflen = fread( buffer, sizeof( char ), BIGBUF, fp );
- if( !buflen )
- break ;
-
- for( pt = 0 ; pt < buflen ; pt ++ )
- {
- offset++ ;
- if( offset > up_to )
- break ;
- uc = buffer[ pt ] ;
- prev_type = type ;
- type = table[ uc ] ;
-
- if( !accent_ok && ( type == HI_CONSONANT || type == HI_VOWEL ))
- type = NON_PRINT ;
-
- /* Two accented characters in sequence are not printable */
-
- if(( prev_type == HI_CONSONANT || prev_type == HI_VOWEL )
- && ( type == HI_CONSONANT || type == HI_VOWEL ))
- {
- type = NON_PRINT ;
- in_store-- ;
- }
-
- if( type == NON_PRINT )
- {
- if( words && in_store == STORE )
- putchar( '\n' );
- else if( words && in_store &&
- check_store( in_store, stor_typ ))
- {
- for( i = 0 ; i < in_store ; i++ )
- {
- if( putchar( store[i] ) != store[i] )
- {
- fprintf( stderr,
- "Unable to write... FATAL.\n\n" );
- exit( 1 );
- }
- }
- putchar( '\n' );
- }
- if( in_store )
- clear_store( &in_store, stor_typ ) ;
- continue ;
- }
-
- /* printable characters - output or add to store */
-
- if( !words || in_store == STORE )
- {
- if( putchar( uc ) != uc )
- {
- fprintf( stderr, "Unable to write... FATAL.\n\n" );
- exit( 1 );
- }
- }
- else
- {
- store[ in_store++ ] = uc ;
- stor_typ[ type ]++ ;
- if( in_store == STORE )
- {
- if( !check_store( in_store, stor_typ ))
- clear_store( &in_store, stor_typ ) ;
- else
- {
- for( i = 0 ; i < in_store ; i++ )
- {
- if( putchar( store[i] ) != store[i] )
- {
- fprintf( stderr,
- "Unable to write... FATAL.\n\n" );
- exit( 1 );
- }
- }
- }
- }
- }
- }
- }
- return;
- }
- /*
- * CLEAR_STORE
- */
- void
- clear_store( in_store, stor_typ )
- int *in_store, /* consecutive bytes in store */
- stor_typ[ TYPE_CT ]; /* count each type in store */
- {
- int i ;
-
- *in_store = 0 ;
- for( i = 1 ; i < TYPE_CT ; i++ )
- stor_typ[ i ] = 0 ;
- return ;
- }
- /*
- * CHECK_STORE - Is the series held in "store" valid "words"?
- *
- * In the version that follows, a series passes if it contains 5 or
- * more bytes made up of:
- * 1. digits with NO vowels AND NO consonants
- * 2. vowels AND consonants (with or without digits)
- *
- * You may wish to try alternative forms of this function. Its objective
- * is to maximize retention of desired text while minimizing retention of
- * junk. Proximity might be considered... more than 4 consonants in a
- * row, no vowels between white spaces, etc.
- */
- Bool
- check_store( in_store, typ )
- int in_store,
- typ[ TYPE_CT ]; /* count each type in store */
- {
- if( in_store < 5 )
- return( FALSE );
- typ[ CONSONANT ] += typ[ HI_CONSONANT ] ;
- typ[ VOWEL ] += typ[ HI_VOWEL ];
- if( typ[ DIGIT ] && !typ[ CONSONANT ] && !typ[ VOWEL ] )
- return( TRUE ) ;
- if( typ[ CONSONANT ] && typ[ VOWEL ] )
- return( TRUE ) ;
- return( FALSE ) ;
- }