/*
 * Usage - sort2 [/r] [/+n] from_file to_file key[s]
 *
 * sort2 Sorts large ASCII files using the memory-bound DOS SORT
 * routine in multiple passes. /r signifies reverse order.
 * /+n specifies a starting column, 1-999. A key is 1 to 3
 * characters, used as a dividing point. The program separates
 * the input file into a series of temporary files, depending on
 * the byte(s) at the starting column. For n dividing points,
 * the program makes n+1 temporary files, and reports the size
 * of each. If all are under 60k characters, they are sorted
 * and placed together in the output file. If a run fails, add
 * another dividing point mid-way in the range that fails (that
 * is, the file that is too big), and try again. NOTE: The DOS
 * SORT starts column count at 1, converts all lower to upper case!
 *
 * input: Line oriented printable ASCII text.
 *
 * output: Same file, sorted.
 *
 * writeup: MIR TUTORIAL ONE, topic 5
 *
 * Written: Douglas Lowry Mar 06 91
 * Modified: Douglas Lowry Feb 21 92
 * Copyright (C) 1992 Marpex Inc.
 *
 * The MIR (Mass Indexing and Retrieval) Tutorials explain detailed
 * usage and co-ordination of the MIR family of programs to analyze,
 * prepare and index databases (small through gigabyte size), and
 * how to build integrated retrieval software around the MIR search
 * engine. The fifth of the five MIR tutorial series explains how
 * to extend indexing capability into leading edge search-related
 * technologies. For more information, GO IBMPRO on CompuServe;
 * MIR files are in the DBMS library. The same files are on the
 * Canada Remote Systems BBS. A diskette copy of the Introduction
 * is available by mail ($10 US... check, Visa or Mastercard);
 * diskettes with Introduction, Tutorial ONE software and the
 * shareware Tutorial ONE text cost $29. Shareware registration
 * for a tutorial is also $29.
 *
 * E-mail...
 *     Compuserve  71431,1337
 *     Internet    doug.lowry%canrem.com
 *     UUCP        canrem!doug.lowry
 *     Others:     doug.lowry@canrem.uucp
 *
 * FAX... 416 963-5677
 *
 * "Snail mail"... Douglas Lowry, Ph.D.
 *                 Marpex Inc.
 *                 5334 Yonge Street, #1102
 *                 North York, Ontario
 *                 Canada M2N 6M2
 *
 * Related database consultation and preparation services are
 * available through:
 *     Innotech Inc., 2001 Sheppard Avenue E., Suite #118,
 *     North York, Ontario Canada M2J 4Z7
 *     Tel. 416 492-3838 FAX 416 492-3843
 *
 * This program is free software; you may redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of
 * the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * (file 05LICENS) along with this program; if not, write to the
 * Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
 * USA.
 */

#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <direct.h>
#include <dos.h>
#include <process.h>

/* Size of the buffer that holds one line read from the input file. */
#define MAX_BYTES   1024

/* Number of rows in the table of dividing keys. */
#define MAX_DIV     25

/* Spelling for an endless loop; leave it with break or return. */
#define repeat      for(;;)

/* Truth values used for the program's flags. */
typedef enum _bool {
    FALSE = 0,
    TRUE  = 1
} Bool;
/*
 *  declarations
 */

/* Both are defined further down in this file. */
void    Usage_();
void    process();

/* Program name, as inserted into the usage text. */
char *
Cmdname_()
{
    return "sort2";
}

/* Input file: opened by main(), read by process(). */
FILE    *fp_in;

- /*
- * MAIN
- */
-
- main( argc, argv )
- int argc;
- char **argv;
- {
- unsigned char dividers[ MAX_DIV ][ 4 ],
- uc[4] ;
- Bool reverse; /* perform reverse sort */
- int used_args, /* arguments identified so far */
- div_ct, /* count of dividers */
- start_col, /* start sorting at column */
- test, /* evaluate a comparison */
- ar, i, j ;
-
- /* usage: sort2 [/r] [/+n] from_file to_file key[s] */
-
- if( argv[1][0] == '-' || argc < 4 )
- Usage_();
-
- start_col = 1 ;
- used_args = div_ct = 0 ;
- reverse = FALSE ;
-
- for( i = 1 ; i < 3 ; i++ )
- {
- if( argv[i][0] != '/' )
- break ;
- if( argv[i][1] == 'r' || argv[i][1] == 'R' )
- {
- used_args++ ;
- reverse = TRUE ;
- }
- else if( argv[i][1] == '+' )
- {
- used_args++ ;
- start_col = ( int ) atol( &argv[i][2] ) ;
- }
- else
- {
- fprintf( stderr, "\nUnrecognized argument with /\n\n" );
- Usage_();
- }
- }
-
- if(( fp_in = fopen( argv[ used_args + 1 ], "rb" )) == NULL )
- {
- fprintf( stderr, "FATAL... Unable to open file %s\n",
- argv[ used_args + 1 ] );
- Usage_();
- }
-
- unlink( argv[ used_args + 2 ] ); /* output file name */
-
- if( argc < used_args + 3 )
- Usage_();
-
- for( ar = used_args + 3 ; ar < argc ; ar++ )
- {
- if( strlen( argv[ ar ] ) > 3 )
- Usage_();
- for( i = 0 ; argv[ ar ][ i ] ; i++ )
- {
- uc[i] = argv[ar][i] ;
- if( islower( uc[i] ) )
- uc[i] = toupper( uc[i] );
- }
- uc[i] = 0 ;
-
- if( !div_ct )
- {
- strcpy( dividers[ 0 ], uc ) ;
- div_ct = 1 ;
- }
- else
- {
- /* insertion sort */
- for( i = 0 ; i < div_ct ; i++ )
- {
- test = strcmp( uc, dividers[ i ] ) ;
- if( !test )
- break ; /* a repetition */
- if( test < 0 )
- {
- for( j = div_ct ; j > i ; j-- )
- strcpy( dividers[ j - 1 ], dividers[ j ] );
- strcpy( dividers[ i ], uc ) ;
- div_ct++ ;
- break ;
- }
- }
- if( i == div_ct ) /* add to end */
- strcpy( dividers[ div_ct++ ], uc ) ;
- if( div_ct > MAX_DIV -1 )
- fprintf( stderr, "RECOMPILE... over %d dividers.\n",
- MAX_DIV - 1 );
- }
-
- }
-
- process( reverse, dividers, div_ct, start_col, argv[ used_args +2] );
-
- fclose( fp_in );
- exit( 0 );
- }
/*
 *  Usage
 *
 *  Writes the command summary to stderr, then ends the program with
 *  exit status 1.  Does not return.
 */
void
Usage_()
{
    /* Everything that follows the "usage:" line, one output line per
     * entry.  None of it contains a conversion specification. */
    static const char *const help[] = {
        "Sorts large ASCII files using the memory-bound DOS SORT\n",
        "routine in multiple passes. /r signifies reverse order.\n",
        "/+n specifies a starting column, 1-999. A key is 1 to 3\n",
        " characters, used as a dividing point. The program separates\n",
        "the input file into a series of temporary files, depending on\n",
        "the byte(s) at the starting column. For n dividing points,\n",
        "the program makes n+1 temporary files, and reports the size\n",
        " of each. If all are under 60k characters, they are sorted\n",
        "and placed together in the output file. If a run fails, add\n",
        "another dividing point mid-way in the range that fails (that\n",
        "is, the file that is too big), and try again. NOTE: The DOS\n",
        " SORT starts column count at 1, converts all lower to upper case!\n\n",
        "input: Line oriented printable ASCII text.\n\n",
        "output: Same file, sorted.\n\n",
        "writeup: MIR TUTORIAL ONE, topic 5\n\n"
    };
    unsigned int    k ;

    fprintf( stderr, "usage: %s [/r] [/+n] from_file to_file key[s]\n\n",
        Cmdname_() );
    for( k = 0 ; k < sizeof help / sizeof help[0] ; k++ )
        fputs( help[ k ], stderr );

    exit( 1 ) ;
}
- /*
- * PROCESS
- */
- void
- process( reverse, dividers, div_ct, start_col, outnam )
- unsigned char dividers[ MAX_DIV ][ 4 ] ;
- Bool reverse; /* perform reverse sort */
- int div_ct, /* count of dividers */
- start_col; /* start sorting at column */
- char outnam[32]; /* name of output file */
- {
- FILE *fp_tmp, *fp_bat ;
- char fname[ 32 ];
- unsigned char buf[ MAX_BYTES ], uc[4],
- from[4], to[4] ;
- Bool too_big ; /* Won't be able to sort subset.*/
- long int tmp_size ; /* size of temporary file */
- int pass,
- errno,
- test_lo, test_hi,
- len, i, pt ;
-
- unlink ("sort2tmp.bat" );
- if(( fp_bat = fopen( "sort2tmp.bat", "w" )) == NULL )
- {
- fprintf( stderr, "FATAL... Unable to open sort2tmp.bat\n" );
- Usage_();
- }
-
- too_big = FALSE ;
-
- for( pass = 0 ; pass <= div_ct ; pass++ )
- {
- for( i = 0 ; i < 4 ; i++ )
- from[i] = to[i] = uc[i] = 0 ;
- if( pass )
- rewind( fp_in );
- if( reverse )
- {
- if( !pass )
- {
- strcpy( from, dividers[ div_ct - 1 ] ) ;
- to[0] = 255 ; /* max char value */
- }
- else if( pass == div_ct )
- strcpy( to, dividers[0] ) ;
- else
- {
- strcpy( from, dividers[ div_ct - pass - 1 ] ) ;
- strcpy( to, dividers[ div_ct - pass ] ) ;
- }
- }
- else /* forward sort */
- {
- if( pass )
- strcpy( from, dividers[ pass - 1 ] ) ;
- if( pass < div_ct )
- strcpy( to, dividers[ pass ] ) ;
- else
- to[0] = 255 ;
- }
-
- sprintf( fname, "sort%02d.tmp", pass );
- if(( fp_tmp = fopen( fname, "wb" )) == NULL )
- {
- fprintf( stderr,
- "FATAL... Unable to open %s\n", fname );
- Usage_();
- }
-
- tmp_size = 0 ;
- while( fgets( buf, MAX_BYTES, fp_in ) != NULL )
- {
- len = strlen( buf ) ;
- if( len < start_col )
- {
- for( i = 0 ; i < 3 ; i++ )
- uc[i] = 0 ;
- }
- else
- {
- for( i = 0 ; i < 3 ; i++ )
- {
- uc[i] = buf[ start_col - 1 + i ];
- if( islower( uc[i] ))
- uc[i] = toupper( uc[i] );
- }
- }
- test_lo = strcmp( uc, from ) ;
- test_hi = strcmp( uc, to ) ;
- if( test_lo < 0 || test_hi >= 0 )
- continue ;
- fputs( buf, fp_tmp );
- tmp_size += len ;
- }
-
- if(( !reverse && pass < div_ct ) || ( reverse && pass ))
- fprintf( stderr, "Setting up to %s, size %ld bytes.\n",
- to, tmp_size );
- else
- fprintf( stderr,"Setting beyond %s, size %ld bytes.\n",
- dividers[ div_ct - 1], tmp_size );
-
- if( tmp_size > 60000 )
- too_big = TRUE ;
- fclose( fp_tmp );
-
- /* Build up the command line */
-
- if( tmp_size )
- {
- strncpy( buf, "sort ", 5 );
- pt = 5 ;
- if( reverse )
- {
- strncpy( &buf[ pt ], "/r ", 3 );
- pt += 3 ;
- }
- if( start_col > 1 )
- {
- sprintf( &buf[ pt ], "/+%d ", start_col );
- pt += 4 ;
- if( start_col > 9 ) /* to 2 digits */
- pt++ ;
- if( start_col > 99 ) /* to 3 digits */
- pt++ ;
- }
- buf[ pt ] = '\0' ;
- fprintf( fp_bat, "%s < %s >> %s\n", buf, fname, outnam );
- }
- fprintf( fp_bat, "del %s\n", fname );
- }
-
- fclose( fp_bat );
- if( !too_big )
- {
- errno = spawnl( 0, "sort2tmp.bat", "sort2tmp.bat", NULL );
- if( errno )
- fprintf( stderr, "spawnl error # %d\n", errno );
- }
-
- unlink( "sort2tmp.bat" );
- return ;
- }