Source Code 1994 March

home *** CD-ROM | disk | FTP | other *** search

/ Source Code 1994 March / Source_Code_CD-ROM_Walnut_Creek_March_1994.iso / compsrcs / misc / volume36 / unpost / part02 < prev next >

Wrap

Text File | 1993-04-18 | 60.6 KB | 1,719 lines

Newsgroups: comp.sources.misc,alt.binaries.pictures.utilities From: jstevens@teal.csn.org (John W.M. Stevens) Subject: v36i115: unpost - Smart multi-part uudecoder v2.1.2, Part02/07 Message-ID: <1993Apr19.052156.28711@sparky.imd.sterling.com> X-Md4-Signature: 97215d8e94a24d3e286cbbb240948847 Date: Mon, 19 Apr 1993 05:21:56 GMT Approved: kent@sparky.imd.sterling.com Submitted-by: jstevens@teal.csn.org (John W.M. Stevens) Posting-number: Volume 36, Issue 115 Archive-name: unpost/part02 Environment: UNIX, MS-DOS, OS/2, Windows, MacIntosh, Amiga, Vax/VMS #! /bin/sh # This is a shell archive. Remove anything before this line, then feed it # into a shell via "sh file" or similar. To overwrite existing files, # type "sh file -c". # Contents: lex.c segment.h unpost.doc # Wrapped by kent@sparky on Sun Apr 18 23:10:30 1993 PATH=/bin:/usr/bin:/usr/ucb:/usr/local/bin:/usr/lbin ; export PATH echo If this archive is complete, you will see the following message: echo ' "shar: End of archive 2 (of 7)."' if test -f 'lex.c' -a "${1}" != "-c" ; then echo shar: Will not clobber existing file \"'lex.c'\" else echo shar: Extracting \"'lex.c'\" $16470 characters$ sed "s/^X//" >'lex.c' <<'END_OF_FILE' X/****************************************************************************** X* Module : Lexical Analyzer --- Process the input text file into tokens X* that the configuration file parser can understand. X* X* Routines : Lex() - Return the next token from the file. X* OpenPrg - Open the back up program script file. X* X* Author : John W. M. Stevens X******************************************************************************/ X X#include "compiler.h" X X#include "lex.h" X X/* Type definitions for this file. */ Xtypedef struct key_st { X char c; X TKNS val; X struct key_st *branch; X} KEY; X X/* Constants local to this file. */ X#define NOT_FND -2 X X/* Object Data. */ Xstatic char word[80]; /* Last string analyzed. */ Xstatic int LnNo = 0; /* The current line number reading the script. 
*/ Xstatic FILE *PrgFl; /* Pointer to the ASCII file that contains the X * backup program script. X */ X X/* Trie data structure containing all the keywords and punctuation marks for X* the backup language. X*/ Xstatic XKEY T8[2] = { X { ' ', 2, NULL }, X { 'e', T_ALTERNATE, NULL } X}; X Xstatic XKEY T7[2] = { X { ' ', 2, NULL }, X { 't', 0, T8 } X}; X Xstatic XKEY T6[2] = { X { ' ', 2, NULL }, X { 'a', 0, T7 } X}; X Xstatic XKEY T5[2] = { X { ' ', 2, NULL }, X { 'n', 0, T6 } X}; X Xstatic XKEY T4[2] = { X { ' ', 2, NULL }, X { 'r', 0, T5 } X}; X Xstatic XKEY T3[2] = { X { ' ', 2, NULL }, X { 'e', 0, T4 } X}; X Xstatic XKEY T2[2] = { X { ' ', 2, NULL }, X { 't', 0, T3 } X}; X Xstatic XKEY T1[2] = { X { ' ', 2, NULL }, X { 'l', 0, T2 } X}; X Xstatic XKEY Tb[2] = { X { ' ', 2, NULL }, X { 'y', T_BODY, NULL } X}; X Xstatic XKEY Ta[2] = { X { ' ', 2, NULL }, X { 'd', 0, Tb } X}; X Xstatic XKEY T9[2] = { X { ' ', 2, NULL }, X { 'o', 0, Ta } X}; X Xstatic XKEY Te[2] = { X { ' ', 2, NULL }, X { 'e', T_CASE, NULL } X}; X Xstatic XKEY Td[2] = { X { ' ', 2, NULL }, X { 's', 0, Te } X}; X Xstatic XKEY Tc[2] = { X { ' ', 2, NULL }, X { 'a', 0, Td } X}; X Xstatic XKEY T13[2] = { X { ' ', 2, NULL }, X { 'r', T_HEADER, NULL } X}; X Xstatic XKEY T12[2] = { X { ' ', 2, NULL }, X { 'e', 0, T13 } X}; X Xstatic XKEY T11[2] = { X { ' ', 2, NULL }, X { 'd', 0, T12 } X}; X Xstatic XKEY T10[2] = { X { ' ', 2, NULL }, X { 'a', 0, T11 } X}; X Xstatic XKEY Tf[2] = { X { ' ', 2, NULL }, X { 'e', 0, T10 } X}; X Xstatic XKEY T18[2] = { X { ' ', 2, NULL }, X { 'e', T_IGNORE, NULL } X}; X Xstatic XKEY T17[2] = { X { ' ', 2, NULL }, X { 'r', 0, T18 } X}; X Xstatic XKEY T16[2] = { X { ' ', 2, NULL }, X { 'o', 0, T17 } X}; X Xstatic XKEY T15[2] = { X { ' ', 2, NULL }, X { 'n', 0, T16 } X}; X Xstatic XKEY T14[3] = { X { ' ', 3, NULL }, X { 'd', T_ID, NULL }, X { 'g', 0, T15 } X}; X Xstatic XKEY T1d[2] = { X { ' ', 2, NULL }, X { 'r', T_NUMBER, NULL } X}; X Xstatic XKEY T1c[2] = { X { ' ', 2, NULL }, 
X { 'e', 0, T1d } X}; X Xstatic XKEY T1b[2] = { X { ' ', 2, NULL }, X { 'b', 0, T1c } X}; X Xstatic XKEY T1a[2] = { X { ' ', 2, NULL }, X { 'm', 0, T1b } X}; X Xstatic XKEY T19[2] = { X { ' ', 2, NULL }, X { 'u', 0, T1a } X}; X Xstatic XKEY T20[2] = { X { ' ', 2, NULL }, X { 't', T_PART, NULL } X}; X Xstatic XKEY T1f[2] = { X { ' ', 2, NULL }, X { 'r', 0, T20 } X}; X Xstatic XKEY T1e[2] = { X { ' ', 2, NULL }, X { 'a', 0, T1f } X}; X Xstatic XKEY T27[2] = { X { ' ', 2, NULL }, X { 's', T_SEGMENTS, NULL } X}; X Xstatic XKEY T26[2] = { X { ' ', 2, NULL }, X { 't', T_SEGMENT, T27 } X}; X Xstatic XKEY T25[2] = { X { ' ', 2, NULL }, X { 'n', 0, T26 } X}; X Xstatic XKEY T24[2] = { X { ' ', 2, NULL }, X { 'e', 0, T25 } X}; X Xstatic XKEY T23[2] = { X { ' ', 2, NULL }, X { 'm', 0, T24 } X}; X Xstatic XKEY T2d[2] = { X { ' ', 2, NULL }, X { 'e', T_SENSITIVE, NULL } X}; X Xstatic XKEY T2c[2] = { X { ' ', 2, NULL }, X { 'v', 0, T2d } X}; X Xstatic XKEY T2b[2] = { X { ' ', 2, NULL }, X { 'i', 0, T2c } X}; X Xstatic XKEY T2a[2] = { X { ' ', 2, NULL }, X { 't', 0, T2b } X}; X Xstatic XKEY T29[2] = { X { ' ', 2, NULL }, X { 'i', 0, T2a } X}; X Xstatic XKEY T28[2] = { X { ' ', 2, NULL }, X { 's', 0, T29 } X}; X Xstatic XKEY T22[3] = { X { ' ', 3, NULL }, X { 'g', 0, T23 }, X { 'n', 0, T28 } X}; X Xstatic XKEY T31[2] = { X { ' ', 2, NULL }, X { 'g', T_STRING, NULL } X}; X Xstatic XKEY T30[2] = { X { ' ', 2, NULL }, X { 'n', 0, T31 } X}; X Xstatic XKEY T2f[2] = { X { ' ', 2, NULL }, X { 'i', 0, T30 } X}; X Xstatic XKEY T2e[2] = { X { ' ', 2, NULL }, X { 'r', 0, T2f } X}; X Xstatic XKEY T21[3] = { X { ' ', 3, NULL }, X { 'e', 0, T22 }, X { 't', 0, T2e } X}; X Xstatic XKEY T35[2] = { X { ' ', 2, NULL }, X { 'l', T_TOTAL, NULL } X}; X Xstatic XKEY T34[2] = { X { ' ', 2, NULL }, X { 'a', 0, T35 } X}; X Xstatic XKEY T33[2] = { X { ' ', 2, NULL }, X { 't', 0, T34 } X}; X Xstatic XKEY T32[2] = { X { ' ', 2, NULL }, X { 'o', 0, T33 } X}; X Xstatic XKEY T0[12] = { X { ' ', 12, NULL }, X { 
'a', 0, T1 }, X { 'b', 0, T9 }, X { 'c', 0, Tc }, X { 'h', 0, Tf }, X { 'i', 0, T14 }, X { 'n', 0, T19 }, X { 'p', 0, T1e }, X { 's', 0, T21 }, X { 't', 0, T32 }, X { '{', T_L_BRACE, NULL }, X { '}', T_R_BRACE, NULL } X}; X X/*----------------------------------------------------------------------------- X| Routine : TrieSrch() --- Search the trie for a token. X| X| Inputs : Keys - The trie level pointer. X| ch - The current character to search for. X| WordPtr - The pointer to the current byte of the word buffer. X| Outputs : The token number or TKN_NOT_FND for not found. X-----------------------------------------------------------------------------*/ X Xstatic Xint TrieSrch(KEY *Keys, X int ch, X char *WordPtr) X{ X register int mid; /* Mid point of array piece. */ X register TKNS ret; /* Return value of comparison. */ X X auto int lo; /* Limits of current array piece. */ X auto int hi; X X /* Make sure that input is lower case. */ X ch = tolower( ch ); X X /* Search for a token. */ X hi = Keys[0].val - 1; X lo = 1; X do X { X /* Find mid point of current array piece. */ X mid = (lo + hi) >> 1; X X /* Do character comparison. */ X ret = ch - Keys[mid].c; X X /* Fix the array limits. */ X if (ret <= 0) X hi = mid - 1; X if (ret >= 0) X lo = mid + 1; X X } while (hi >= lo); X X /* If the character matches one of the entries in this level and this X * entry has a child, recurse. If a match is found but the matching X * entry has no child, return the token value associated with the X * match. If the return value from the recursive call indicates that X * no match was found at a lower level, return the token value X * associated with the match at this level of the trie. X */ X if (ret == 0) X { X /* Save the current character. */ X *WordPtr++ = ch; X X /* Is this the last character in the string? */ X if ( Keys[mid].branch ) X { X /* Get the next character. */ X if ((ch = fgetc( PrgFl )) == EOF) X return( EOF ); X X /* Search next level. 
*/ X if ((ret = TrieSrch(Keys[mid].branch, ch, WordPtr)) == T_NOT_FND) X { X ungetc(ch, PrgFl); X return( Keys[mid].val ); X } X return( ret ); X } X else X { X *WordPtr = '\0'; X return( Keys[mid].val ); X } X } X X /* Return not found. */ X *WordPtr = '\0'; X return( T_NOT_FND ); X} X X/*----------------------------------------------------------------------------- X| Routine : Lex() --- Get the next key word from the input file. X| X| Outputs : sym - The symbolic data read from the file. X| X| Return : Returns the token read or EOF. X-----------------------------------------------------------------------------*/ X Xint Lex(TOKEN *sym) X{ X register int tkn; X auto int ch; X extern FILE *ErrFile; X X /* Strip comments and white space. If the character read is a '#', X * every thing to the end of the line is a comment. X */ X ch = fgetc( PrgFl ); X while (ch == ' ' || ch == '\t' || ch == '\n' || ch == '#') X { X /* Process the special characters '#' and '\n'. */ X if (ch == '\n') X LnNo++; X else if (ch == '#') X { X while (fgetc( PrgFl ) != '\n') X ; X LnNo++; X } X X /* Get the next character. */ X ch = fgetc( PrgFl ); X } X X /* Get strings, etc. */ X if (ch == '"') X { X auto char *bf; X X /* Get contents of string. */ X bf = sym->str; X while ((ch = fgetc( PrgFl )) != '"' && ch != EOF) X *bf++ = ch; X *bf = '\0'; X X /* Return string token. */ X return( T_DBL_QUOTE ); X } X else if (ch >= '0' && ch <= '9') X { X /* Get the number. */ X sym->no = 0; X do X { X sym->no = sym->no * 10 + (ch - '0'); X } while ((ch = fgetc( PrgFl )) >= '0' && ch <= '9'); X X /* Return the unused character. */ X ungetc(ch, PrgFl); X return( T_INT_NO ); X } X else if (ch == EOF) X return( EOF ); X X /* Call the trie search routine to return the next token, EOF X * or NOT_FND. If not found, print an error and quit. 
X */ X if ((tkn = TrieSrch(T0, ch, word)) == T_NOT_FND || tkn == 0) X { X fprintf(ErrFile, X "%s %d : Error - cannot identify string '%s' ", X __FILE__, X __LINE__, X word); X fprintf(ErrFile, X "in line %d\n", X LnNo + 1); X exit( 1 ); X } X X /* Return the token found. */ X return( tkn ); X} X X/*----------------------------------------------------------------------------- X| Routine : OpenCfg() --- Open the ASCII text file that contains the X| configuration data. X-----------------------------------------------------------------------------*/ X Xvoid OpenCfg(char *FileNm) X{ X extern FILE *ErrFile; X X /* Open the program script file. */ X if ((PrgFl = fopen(FileNm, TXT_READ)) == NULL) X { X fprintf(ErrFile, X "%s %d : Error - %s\n", X __FILE__, X __LINE__, X sys_errlist[errno]); X fprintf(ErrFile, X "\tFile Name: '%s'\n", X FileNm); X exit( 1 ); X } X} X X/*----------------------------------------------------------------------------- X| Routine : CloseCfg() --- Close the ASCII text file that contains the X| configuration data. X-----------------------------------------------------------------------------*/ X Xvoid CloseCfg(void) X{ X fclose( PrgFl ); X} X X/*----------------------------------------------------------------------------- X| Routine : ParseErr() --- Report a parse error. X| X| Inputs : ErrStr - The error string. X-----------------------------------------------------------------------------*/ X Xvoid ParseErr(char *ErrStr) X{ X extern FILE *ErrFile; X X fprintf(ErrFile, X "%s %d : Error - %s\n", X __FILE__, X __LINE__, X ErrStr); X fprintf(ErrFile, X "\tLine %d, word '%s'\n", X LnNo + 1, X word); X exit( 1 ); X} END_OF_FILE if test 16470 -ne `wc -c <'lex.c'`; then echo shar: \"'lex.c'\" unpacked with wrong size! 
fi # end of 'lex.c' fi if test -f 'segment.h' -a "${1}" != "-c" ; then echo shar: Will not clobber existing file \"'segment.h'\" else echo shar: Extracting \"'segment.h'\" \(469 characters\) sed "s/^X//" >'segment.h' <<'END_OF_FILE' X/****************************************************************************** X* Module : Segment header file. X* X* Author : John W. M. Stevens X******************************************************************************/ X X#if ! defined(SEGMENT_HEADER_FILE) X#define SEGMENT_HEADER_FILE X X/* Function prototypes. */ Xextern Xvoid Single(char *FileNm); Xextern Xvoid Multiple(char *FileNm); Xextern Xvoid UUDecode(char *FlName); X X#endif END_OF_FILE if test 469 -ne `wc -c <'segment.h'`; then echo shar: \"'segment.h'\" unpacked with wrong size! fi # end of 'segment.h' fi if test -f 'unpost.doc' -a "${1}" != "-c" ; then echo shar: Will not clobber existing file \"'unpost.doc'\" else echo shar: Extracting \"'unpost.doc'\" \(40962 characters\) sed "s/^X//" >'unpost.doc' <<'END_OF_FILE' XUNPOST X XName: X X unpost - Extract binary files from multi-segment uuencoded USENET X postings or Email. X XSynopsis: X X unpost [-b[-]] [-c <configuration file>] [-d[-]] [-e <error file>] X [-f[-]] [-h|-s|-u] [-i <incompletes file>] [-t <text file>] X <source file> X X Where everything but the source file is optional. X XDescription: X X UNPOST is a tool designed primarily to extract binaries from USENET X binaries postings such as those made to alt.binaries.pictures.misc X and comp.binaries.ibm.pc. As well as extracting binaries from USENET X postings, UNPOST can extract binaries from multi-segment uuencoded X mailings as well, however, to simplify this documentation only X USENET article postings will be discussed. The principles are the X same for multi-segment mailings. X X To avoid confusion, this documentation will refer to a single letter X OR article as a 'segment'. For clarification on what a segment means X to UNPOST, see Theory of Operations. 
X XFeatures: X X1) PORTABILITY! UNPOST has been compiled and successfully run on X MS-DOS, OS/2, Windows, Unix workstations, MacIntoshes, Amigas X and VAX/VMS systems. X X The code is written to be pure ANSI C, within reasonable limits. X (some ANSI C capabilities are not used where they would be X appropriate due to lagging compliance in most compilers. Hey, X Unix types! MS-DOS (Borland C++ 3.1) is a MUCH better compiler X than anything I've seen on a Unix workstation! And their debugger X is the best I've used, as well). Unfortunately, there are still X a lot of Unix boxes that have only a K&R compiler, so it may X not port well to those. I personally check to make sure that it X will compile and run on an MS-DOS box running MS-DOS 5 and Windows X 3.1, using the Borland 3.1 C++ compiler, as well as a Sun (running X SunOs 4.1.1 sun4c) using the gcc compiler (version 2.1). I know X for a fact that the Sun cc compiler will NOT compile UNPOST X successfully. X X K&R compatibility is being considered, but it is a low priority X feature. X X2) CONFIGURABILITY! UNPOST comes with a default set of rules for X detecting and parsing a VERY wide range of possible Subject: X line formats, but no configuration can be correct for every X situation. X X With that in mind, UNPOST can be configured by the user by creating X a text file that contains the regular expressions, etc. that X UNPOST uses to recognize, parse, etc. WARNING! UNPOST depends X almost ENTIRELY on the contents of its configuration file for X correct operation. X X Regular expressions are complex, and writing one that works the X way you expect it to takes care and, most importantly, X experimentation. X X To this end, the standard UNPOST installation creates both the X UNPOST executable and a regular expression test program called X RETEST. RETEST is like grep, feed it a regular expression and X a file, and RETEST will tell you what it matched and the sub X strings that it extracted. X X3) INTELLIGENCE! 
UNPOST uses every trick in the book to try to X guess what the poster/sender REALLY meant. X X Also, UNPOST is not limited to finding all of its information X on a single line, or even in the header of a posting/letter. X X UNPOST has successfully extracted binaries from postings that had, X as a subject line, X X Subject: aaaa X X because UNPOST recognized the signature placed into the body of X the article by a uuencode/split program. X X4) FLEXIBILITY! UNPOST has switches that allow it to be configured X to do different things for different tastes. For instance, UNPOST X will intelligently sort out articles into four different classes: X X 1) Articles that are part of a complete and correct binary in X the input file. These are sorted, concatenated, uudecoded X and written out to a file name that is the same as that X on the uuencode begin line. X X Depending on the setting of the file name switch, the file X name of the binary may be modified. See below. X X 2) Articles that are pure text (no uuencoded data in them). X X If the -t switch and a file name are specified, these X articles will be written out to the file for reading. X X Obviously, these articles should NEVER be encountered in X a binaries news group, but not a single day has ever gone X by that I did not see non-binary postings to binary news X groups. X X 3) Articles that are part of incomplete postings (four parts, X but only three have shown up so far), or that comprise X a complete binary, but one that had an error in uudecoding, X interpretation, etc. X X If the -i flag and a file name are specified, these articles X will be written out to the file. If the -b switch is X on, incompletes will be written to separate files. If X both are on, those incompletes that can be guessed at X as having a file name will be written to a separate file, X all else will be written to the file named by the -i X switch. 
X X In my experience, two types of articles end up in an X incompletes file, those that have missing parts, and X those that have been misinterpreted by UNPOST as belonging X to a different binary than they really do. X X 4) Articles that are pure text that describe a posting X (these are usually found only in the pictures groups). X X If the -d flag is set, and the binary to which they X belong is correct and complete, this article, as well as X the header and body up to the uuencode begin line of the X first article, will be written to a file that has the same X base name as the binary, but with the extension .inf. X X UNPOST automatically mungles binary file names to be MS-DOS X compatible (the lowest common denominator). This is switch X controllable, and can be turned on or off (depending on the X default setting selected by the person who compiled UNPOST). X X UNPOST also has two lesser modes, sorted mode and uudecode mode. X X In sorted mode, UNPOST assumes that the articles still have X headers, and that there may be un-uuencoded lines in the middle X of a uuencoded file that have to be filtered out, but it assumes X that all parts are present, and that they are in order. Header X information, however, is ignored. X X If you use the incompletes file capability of UNPOST, you will X notice that it writes out the segments that it did interpret X correctly in sorted order. X X In uudecode mode, UNPOST acts like a simple uudecoder. UUencoded X files must be complete, with a begin and end line, and no X un-uuencoded lines can appear between the begin and end lines. X X However, uudecode mode is the ONLY mode where UNPOST will accept X a short line (one that was space terminated, but had the spaces X chopped off) as a legal uuencoded line and properly decode it. X X5) INFORMATIVE! UNPOST is a very talkative program. It detects X and reports many kinds of problems, tells you what it thinks X is going on, and tells you what it is doing. 
All this information X is written to standard error, or if the -e switch and a file X name are specified, written to that file. X XTheory of Operations: X X UNPOST assumes that the source file that is given to it will have the X following format: X X SEGMENT begin line X ... X HEADER ID line X ... X BODY ID line X ... X UUENCODED line X X Where the lines are: X X SEGMENT begin line - Is the line that identifies the beginning of a X segment. X HEADER ID line - One or more lines that contain segment number, X total number of segments or the ID string in X the article or mail header. X BODY ID line - One or more lines that contain segment number, X total number of segments or the ID string in X the article or mail message body. X UUENCODED line - Is the first uuencoded line in the file. X UUencoded lines include the begin and end lines. X ... - Indicates zero or more lines that can contain X any information so long as they CANNOT be X misidentified as SEGMENT begin, ID or UUENCODED X lines. X X Notice that the ID information can be spread across multiple lines. A X segment is assumed to end at the beginning of the next segment, or at X the end of the source file. An UNPOST source file contains one or more X segments. X X UNPOST has three different modes, interpretation mode, concatenation X mode and UU decoder mode. In all three modes, UNPOST can accept one X or more input files. X X In the first mode, interpretation mode, UNPOST looks at segment header X and body lines before the first UU encoded line, and attempts to extract X three pieces of information from them: segment number, total number X of segments that the binary was split into, and an ID string that is X common to all segments. If UNPOST finds something that it considers X to be an ID string, and a uuencoded line in the segment, but it does X not find a segment number and number of segments, UNPOST assumes that X the segment is a single segment binary posting (part 1 of 1). 
X X To aid in finding out what happened, in interpretation mode UNPOST X will write a list of all the different ID strings and their respective X segment lists to standard error or the file specified as the error X file (see Standards section for details of what an ID string is). X Any errors or warnings detected during processing will also be X written to standard error or error file. X X In interpretation mode three other files can optionally be created. X All three of these files will contain segments copied out of the source X file, and none of these files will be created unless they are turned X on and named by a command line switch. X X The first optional file that UNPOST can create for the user in X interpretation mode is the text file (-t switch). This file will have X copied to it all segments from the source file that do not contain X uuencoded data. X X Segments that are part 0/# type segments that do not contain uuencoded X data will NOT be copied to the text file. They are considered to be X description segments, and they will be copied to the description file X only if the -d switch is turned on. Also, all binary postings that X have all of their segments present will have the segment header X and body of segment #1 (up to and including the uuencode begin line) X copied into the description file. X X The third optional file that can be created in interpretation mode is X the incomplete or unused uuencode data segments file. This file X contains all segments that have uuencoded data, that were not used in X a successful uudecoding. This file will only be created if the -i X switch is present. X X The incompletes file allows the user to hand decode those binaries which X could not be interpreted or decoded by UNPOST. Often times, a binary X will have all of its parts, but UNPOST will not be able to put them X together because of differences in the ID string between segments, or X problems with the part numbering information. 
The simplest way to X solve these problems is to collect the incompletes, edit the ID X lines to correct the problem, and rerun UNPOST on the incompletes X file. X X In the second mode, concatenation mode, UNPOST assumes that all of the X segments in the source file between a uuencode begin and a uuencode X end line are part of one binary posting and that the segments are in X order. UNPOST scans from the beginning of the file until it finds a X uuencode begin line, and decodes from there (skipping over non- X uuencoded lines such as segment header lines and signatures) until X it finds a uuencode end line. X X In the last mode, UU decoder mode, UNPOST assumes that the source X file contains one or more UU encoded files. Only UU encoded lines X are allowed between the uuencode begin line and the uuencode end line X of any single uuencoded file. X X Example header: X X (1) Article 2096 of alt.binaries.pictures.misc: X Newsgroups: alt.binaries.pictures.misc X Path: csn!csn!convex!cs.utexas.edu! X From: a43xz@brain.ac.da (Joe User) X (2) Subject: ship.gif (1/3) X Organization: Somewhere Near The Sea. X Date: Fri, 19 Feb 1993 06:43:48 GMT X Message-ID: <21128@brain.ac.da> X Sender: news@dep.rnsft.ac.da (Usenet) X Lines: 761 X X X Picture of a ship in a bottle, full rigged. How did it get there? X X (3) section 1 of uuencode 5.20 of file ship.gif by R.E.M. X X (4) begin 644 ship.gif X M1TE&.#=A@`+@`9<```0$!`0$!",G,"LG)RLG,"LG.30G,#TG)RLP,"LP.3TG X X In the above example, line (1) is the SEGMENT begin line, line (2) is X a HEADER ID line, line (3) is a BODY ID line and line (4) is the first X UUENCODED line in the body. X XOptions: X X -b[-] Set this flag to make UNPOST write the incomplete X uuencoded segments to separate files. This defaults X to off. X X -c <file> To read and use a different configuration than the X default configuration. The default configuration is X stored in a file called def.cfg. 
X X -d[-] Turns on description capturing and writes descriptions X to a file that has the same name as the output but with X a .inf extension. This defaults to on. X X -e <file> Redirects error and information output from standard X error to <file>. X X -f[-] Modify file names to be MS-DOS/USENET compatible. Use X of -f turns file name modification on if the default is off, X and -f- turns file name modification off if the default X is on. File name modification is currently the default. X X -h Turns on full interpretation mode. This is the default. X X -i <file> Turns on incomplete binaries capturing and writes the X segments to file <file>. X X -s Switch to ordered segment mode. This mode ignores segment X headers, and assumes that the segments are in order. X X -t <file> Turns on text only segment capturing and writes the segments X to <file>. X X -u Switch to uudecoder mode. Assume only uuencoded data X between begin and end lines. Multiple uuencoded files X are allowed. X X -v Show version number and quit. X X -? Show a summary of the command line switches. X X It is important to realize that UNPOST parses the command line in X parallel with operations, so the order of the switches on the X command line is VERY important. For example: X X unpost -d -e errors -i abpm.inc abpm.uue -c cbip.cfg -d- cbip.uue X X This will use the default configuration to process the file abpm.uue, X writing out description files, writing errors to the file errors, X and writing incompletes to the file abpm.inc. After UNPOST finishes X processing abpm.uue, it will read in the cbip.cfg configuration, X turn off writing description files and process cbip.uue. X X Note that the errors will continue to be written to the file errors, X and that the incomplete binaries will continue to be written to the X file abpm.inc. Since we are switching configurations, this is X probably not a good idea. X XStandards: X X In all modes, UNPOST recognizes and decodes only uuencoded data. 
X X In interpretation mode UNPOST requires that: X X 1) The uuencoded lines be true uuencoded lines. This means X that if trailing spaces are truncated by a mailer, editor X or news node, UNPOST will not consider those lines to X be uuencoded lines. Also, the uuencode character set X recognized by UNPOST is ' ' - '`', with no other characters X being legal. X X 2) That all segments of the same binary file posting have X the same, recognizable ID string. X X 3) Segments have a recognizable SEGMENT begin line as the X first line in the segment (denoting the begining of a X segment). X X 4) That all ID lines follow the SEGMENT begin line in the X segment. X X 5) That the first UUencoded line of the segment follows the X last ID line. X X 6) That the first uuencode line in the first segment be a X begin line. X X 7) That the last segment contain a uuencode end line. X X In sorted segment mode, UNPOST requires that: X X 1) The uuencoded lines be true uuencoded lines. This means X that if trailing spaces are truncated by a mailer, editor X or news node, UNPOST will not consider those lines to X be uuencoded lines. Also, the uuencode character set X recognized by UNPOST is ' ' - '`', with no other characters X being legal. X X 2) That the segments be stored in the file in order. X X 3) That the first uuencode line in the first segment be a X begin line. X X 4) That the last segment contain a uuencode end line. X X In uudecoder mode, UNPOST requires that: X X 1) There be only uuencoded lines between a uuencode begin and X a uuencode end line. In this mode, UNPOST will recognize X and attempt to repair lines that had trailing spaces X truncated. 
X XExamples: X X To extract a single binary that had all of its segments saved in order X to a single file: X X unpost -s binary.uue X X To extract all binaries that have had all of their segments saved X to a single file: X X unpost multiple.uue 2> errors X Or X unpost -e errors multiple.uue X X The file errors will contain a list of all the ID strings that UNPOST X found and thought could have been binary files, and any errors X that occurred during processing. X X To capture the incomplete or unused segments that have uuencoded X data in them: X X unpost -e errors -i multiple.inc multiple.uue X X To capture descriptions and text only segments as well: X X unpost -d -e errors -t text -i multiple.inc multiple.uue X X To process two different files, one in uudecode mode, one in interpretation X mode: X X unpost -e errors -u uuencode.uue -h multiple.uue X X To process a file that requires a different configuration: X X unpost -c cbip.cfg -e errors multiple.uue X XOutput: X X UNPOST will write diagnostic and informative messages to either X standard error or the error file. The error file has three X parts, interpretation errors (duplicate segments, missing X uuencode begin lines, missing ID string, segment number or X number of segments, etc.), a dump of the binaries found, the X number of segments in each binary and the segment number and X offset of each segment in the source file. The last part X is a mixture of information (the name of the binary that UNPOST X is attempting to decode) and any errors encountered during X decoding. X X In the example below, UNPOST found one segment that had uuencoded X data, the Subject: line had barber.gif as the ID string, the X binary has one segment, and in the list of segments below, X we see that segment number 1 starts at offset 583 in the source file. X X If there is a missing segment, its segment number will be zero, X and its file offset will be zero. 
X X There were no interpretation errors, and there were no decoding X errors. X X File ID Segments X ---------------------------------------- X barber.gif 1 X 1 583 X X Decoding Binary ID: 'barber.gif' X XNotes: X X To use this program to collect all of the binaries posted to, say, X the alt.binaries.misc group on a daily basis, start up rn, go to X the alt.binaries.misc newsgroup, and save all of the unread segments X by using this command: X X .-$smisc.uue:j X X This will save all segments from the current number to the last to X the file misc.uue, then junk them. After exiting rn, run UNPOST X on the file misc.uue in interpretation mode (default mode): X X unpost -e errors -i misc.1 misc.uue X X Make sure to check the errors and/or misc.1 file for segments X that UNPOST couldn't extract. X XDiagnostics: X X Error - file 'filename' already exists. X X UNPOST will not overwrite an existing file. Delete the file or X rename it and try again. X X Error - missing begin line. X X UNPOST expected to find a uuencode begin line in this segment, X but did not. X X Error - missing file name. X X The binary that UNPOST was attempting to decode does not X seem to have a uuencode begin line in the first segment, X so UNPOST has no idea what the file name is. X X Error - Could not open description file 'filename' for writing. X X UNPOST could not open a file of that name for some reason. X Possibly a permission problem, or the file exists and is not X writeable. X X Error - Bad write to binary file. X X A file write failed for some unknown reason. Possibly a full X disk? X X Error - missing segment # X Binary ID: 'binaryID' X X In attempting to decode a file whose ID string is binaryID, X one or more segments are missing. X X Error - Missing UU end line. X X As this is the last segment, it ought to have a uuencode end X line in it, but UNPOST did not find one. X X Warning - Early uuencode end line. 
X X UNPOST found a uuencode end line, but this was not the last X segment, so we found it early. Did the poster screw up and X misnumber his segments? X X Error - Unexpected UU begin line. X X We found an unexpected (read: this is not the first line of the X first segment, so what is this doing here?) UU begin line. X X Error - cannot identify string '' in line # X X In reading in a configuration file, the configuration file X lexical analyzer could not recognize this string. X X X Error - Out of memory. X X Yup. Out of memory. Split the source file into smaller X pieces and try again. X X Error - Could not modify file name to be MS-DOS conformant. X X File name mungling is turned on, and the name of one of the X files cannot be made conformant (probably due to having too X many numbers in it). X X Warning - Unexpected end of file in segment: X Segment: 'segment line' X X File name mungling is turned on, and UNPOST is attempting to X identify the file type (so it can use the proper extension X when modifying the file name) but the UU begin line was the X last line in the file. X X Warning - No UU line after begin. X Segment: 'segment line' X X File name mungling is turned on, and UNPOST is attempting to X identify the file type (so it can use the proper extension X when modifying the file name) but the UU begin line was not X followed by a line of UU encoded binary data. X X Error - Got number of segments but not segment number. X Error - Got segment number but not number of segments. X X UNPOST must have all three pieces of relevant data, but if X UNPOST has at least an ID string, UNPOST will attempt to X assume a one part binary. X X Error - Could not get ID string. X X Fatal error, with no ID string, there is no way to collect X the pieces together. X X Error - No begin line in first segment: X Segment: 'segment line' X X UNPOST did not find a UU begin line in the first segment. X X Error - missing '}' in regular expression.
X X In a regular expression of the type abc{1, 2}, the closing curly X brace is missing. X X Error - To many sub-expressions. X X UNPOST has a limit on the number of sub-expressions it X allows. This is a compile time option that can be changed X by modifying the value of MAX_SUB_EXPRS in regexp.h. X X Error - missing ')' in regular expression. X X Mismatched parentheses. X X Error - badly formed regular expression. X Unexpected character 'c' X X I give up! What is this character doing at this point in X a regular expression? X X Error, can not enumerate a sub expression. X X Regular expressions of the type: (...)* are not allowed. X X Error - illegal regular expression node type. X X Whoops, we have an internal programmer's error here. Let X me know if you see this. X X Error - Sub expression # extraction failed. X X Another internal error that needs to be brought to my attention. X X Error - could not open file 'filename' for reading. X X UNPOST could not open file 'filename' for processing. Did you X spell it right? X X Error - Unexpected end of file. X X Error - Unexpected UU begin line. X X Error - Segment number # greater than number of segments in: X Segment: 'segment line' X X Either UNPOST got screwed up somehow or the poster posted X something like (Part 10/9). X X Warning - duplicate segment # in: X Binary ID: 'binaryID' X X UNPOST found two segments with the same binary ID and the X same segment number. X X Error - reading source file. X X Could not read a line from the source file. X X Error - Could not open file 'filename' for output. X X Could not open one of the text, incomplete or error files X for writing. X XRegular Expressions: X X Operands X -------- X X UNPOST regular expressions have three types of operands: character X strings (one or more characters), character sets and match any X single character. A character string is any series of adjacent X characters that are not meta-characters (special characters).
X A character set is a string of characters enclosed in square brackets with X an optional caret (^) as the first character following the open X square bracket. The match any character operand matches any single X character except the end of line character. X X A character string in a regular expression matches the exact string X in the source, including case. X X Example of character strings: X X AirPlane - Matches the string 'AirPlane', but not the strings X 'airPlane' or 'Airplane'. X X A character set will match any single character in the source if X that character is a member of the set. If the first character X of the set is the caret, the character set will match any X character that is NOT a member of the set (including control X characters!) except for NUL and LF. X X A character set can be described using ranges. X X Examples of character sets: X X [abcd] - Matches either a, b, c or d. X X [0-9] - Matches any decimal digit. X X [^a-z] - Matches any character that is NOT a lower X case alphabetic. X X The match any character operand does just that, it matches any X character. But it does not match the case of no character, NUL X or LF. X X Example of match any character: X X . - Matches any character. X X Operators X --------- X X UNPOST regular expressions also contain operators. The operators that X UNPOST recognizes are the alternation operator, the span operators, the X catenation operator and the enumeration operators. X X The alternation operator has the lowest precedence of all the operators X and its action is to attempt to match one of two alternatives. X X Example of alternation: X X Airplane|dirigible - Matches either the string Airplane or the string X dirigible. X X The next higher precedence operator is the catenation operator. The X catenation operator specifies that both the left and right hand X regular expressions must match.
The catenation operator does not X have a special character, it is assumed to exist between two X different operands that have no other operator between them. X X Example of catenation: X X [Aa]irplane - Matches either an 'A' or an 'a' followed by the string X irplane. This is a catenation of the two regular X expressions [Aa] and irplane. X X The next higher precedence operator is the enumeration operator. X The enumeration operator specifies how many instances of a regular X expression must be matched. X X Examples of Enumeration: X X abc* - Matches zero or more occurrences of the string abc. X [A-Z]+ - Matches one or more occurrences of an upper case X alphabetic character. X [ ]? - Matches zero or one occurrence of the space character. X very{1} - Matches one or more occurrences of the string very. X b{1,3} - Matches a minimum of one to a maximum of three occurrences X of the string b. X X An enumeration operator attempts to match the largest source sub- X string possible, except in the case of the . (match any character) X followed by an enumeration operator. In this case, the smallest X possible sub-string is matched. X X The precedence of the operators can be modified with the use of X parentheses. Parentheses have another meaning as well, described X below. X X Example of parenthesis use: X X Death( defying|wish) - Will match either the string 'Death defying' X or the string 'Deathwish'. Without the X parentheses, the regular expression would X match either the string 'Death defying' X or the string 'wish'. X Sub Expressions X --------------- X X UNPOST regular expressions are used primarily for identifying a X particular line and extracting substrings from that line. To X this end, UNPOST regular expressions support sub-expression X marking. Subexpressions are marked by parentheses. X X To determine the sub-expression number of a sub-expression, scan X the regular expression from left to right, counting the number X of left parentheses.
Start with one; whatever the count is for X that sub-expression is its sub-expression number. X X Example: X X .*((abcd)((0-9)+/(0-9)+)) X X Sub-expression ((abcd)((0-9)+/(0-9)+)) is sub-expression #1. X Sub-expression (abcd) is #2. Sub-expression ((0-9)+/(0-9)+) is #3. X The first sub-expression (0-9) is #4. The second sub-expression (0-9) is #5. X X Anchoring X --------- X X Normally, a regular expression will match a sub-string anywhere in X the source string. If you want to specify that the matching sub-string X must start at the beginning of the source string, you may use a caret X character as the first character of the regular expression. This X anchors the regular expression match to the start of the line. X X To anchor a regular expression to the end of a line, use the dollar X sign character. This effectively matches the end of line or end X of string character. X X Anchor operators have a higher precedence than alternation, but lower X than catenation. X XConfiguration: X X Ok, here's how to configure UNPOST to work for you. UNPOST relies X heavily on regular expressions. These regular expressions may X not be correct for your news reader, or system. X X There are five classes of regular expressions: X X 1) The SEGMENT begin line regular expression. X X 2) The ID line prefix regular expression. X X 3) The ID line with part description regular expression list. X X 4) The begin line regular expression. X X 5) The end line regular expression. X X Of these five, I don't expect you to have to modify the regular X expressions for handling begin and end lines, because they should X be correct for all uuencoders that follow the standard format. X X Be aware that UNPOST has a hierarchy of regular expressions. X Each SEGMENT begin line regular expression has underneath it two X lists of regular expressions that recognize ID line prefixes, X and each element in the list of ID line prefix regular expressions X has a list under it that attempts to parse the ID line.
X X The two lists immediately under the SEGMENT begin line regular X expression are for 1) the header and 2) the body. X X The ID line prefix regular expression exists for the sake of X efficiency. It is used to find an ID line before we attempt X to parse it. Modify or add one of these if you wish to change X whether or not a line is recognized by UNPOST as being an ID line. X If you modify this, you must modify the list of segment description X regular expressions to match. X X The SEGMENT begin line regular expressions are used to find the beginning X of a SEGMENT, or the end of a previous segment. Modify these to change X the line or lines that UNPOST recognizes as the beginning of a segment. X X If you get an error message that indicates that the Subject line X has no identifiable part description, and you see that some bright X boy/girl has come up with a brand new part description format, then X you have two choices: modify the source and hope they don't post X again, or add a new ID line regular expression to the list of X ID line regular expressions in the segment.c source file. X X Be aware that the lists of regular expressions are searched in order X from top to bottom to find a match. This means that less specific X regular expressions should be placed later in the list. For example: X the regular expression '\((0-9)+/(0-9)+\)' should come before the X regular expression '(0-9)+ (0-9)+' in the part syntax parsing regular X expression list. This reduces the number of misparses that occur. X X Remember that C uses the backslash (\) as an escape character in X strings, so to put a backslash into a regular expression you X need to put two into the C source string. X X All regular expressions can be found at the top of the parse.c source X file.
Before you modify the actual source code and recompile, I X strongly suggest that you test your new regular expression using the X regular expression test harness (retest) that was compiled by the X makefile when you compiled UNPOST. Then, when you are sure that X it is correct, copy the def.cfg file to a new name, make your changes X there and use that configuration file for a while. If after all this, X you are sure that it works, go in and change the source code in X parse.c. X X Before you add or modify a regular expression, you have to know the X syntax of the regular expressions used in this program. The syntax X is very similar to that used by UN*X style regular expressions, X but is not exactly the same. See the section titled Regular X Expressions before attempting to configure UNPOST. X XConfiguration Files: X X If you don't want to make permanent changes to UNPOST's configuration, X you can make a configuration file. Configuration files are parsed by X UNPOST, the regular expressions compiled, and these regular expressions X control the operation of UNPOST completely. X X A configuration file must have the following syntax: X X segment "..." X { X header X { X "...." X { X "..." X { X id # X segment number # X segments # X alternate id # X case ignore|sensitive X } X } X } X body X { X "...." X { X "..." X { X id # X segment number # X segments # X alternate id # X case ignore|sensitive X } X } X } X } X X Where "..." is a regular expression string, # is a sub expression X number (See the section on regular expressions), and case is X either ignored in regular expression string matching, or string X matching is case sensitive. X X The outermost construct, starting with the segment "..." line X is used to describe how to recognize the beginning of a segment.
X X The two constructs at the first level within the segment construct X are used to identify lines that are expected to contain part # of X # of parts information in the header, or the body of an article. X X Within each header or body group are regular expressions that X are used to parse out the part # of # of parts information from X an identified information line. X X A very simple example (taken directly out of the MUFUD X documentation): X X segment "^Article[:]?" X { X header X { X "^Subject:" X { X "^Subject:(.*)part[ ]+([0-9]+)[ ]*(of|/)[ ]*([0-9]+)(.*)" X { X id 1 X segment number 2 X segments 4 X alternate id 5 X case ignore X } X "^Subject:(.*)([0-9]+)[ ]*(of|/)[ ]*([0-9]+)(.*)" X { X id 1 X segment number 2 X segments 4 X alternate id 5 X case ignore X } X } X } X body X { X } X } X X Where: X X id 1 Specifies the sub expression number of the X sub expression that is used to extract the X binary ID string. X X segment number 2 Specifies the sub expression number of the X sub expression that is used to extract the X segment number. X X segments 4 Specifies the sub expression number of the X sub expression that is used to extract the X number of segments that this binary was split X into. X X alternate id 5 Specifies the alternate sub expression X number. If the first ID sub expression extracts X only an empty string (or one with all white X space), the string extracted by this sub expression X is used to generate the binary ID string. X X case ignore Specifies that the case of alphabetical X characters is to be ignored in regular X expression string matching. X X See the def.cfg file for another (more complete) example. X XDefault Binary Switch Settings: X X To modify the default values of the binary switches, edit the X file compiler.h, and change the value of the defines. There X are defines for file name mungling, breaking incompletes X into separate files and for dumping out description files.
X XBugs: X X This program has been pretty extensively tested in interpretation mode, X and it appears to be both robust and flexible. X X Unfortunately, about once a week, somebody comes up with a new and X unusual way to encode the parts description on the Subject line. X X Bugs, after being found, are chased unmercifully and terminated with X extreme prejudice. If you think you've found one, send all information, X opinions, prejudices and criticisms to me, and the hunt will begin X (just as soon as I can put on my safari hat and grab my debugger. . .). X XRights, Copyright, Legal stuff, etc: X X This program is distributed free of charge, but it has NOT been placed X in the public domain! I retain copyright. X X Why? Because I am pathologically committed to producing and maintaining X a quality product, and if every Tom, Dick and Susan modifies UNPOST X and redistributes, I will not be able to respond to bug reports or X continue to upgrade the product (branch revision problem, and all X that. . .). X X My job isn't done so long as a single bug exists, or even one user X is unhappy (or even one system is uninfected. . . er, that is to X say, supported :-). X X However, I am also dedicated to the principle of maximum use. If X you wish, you may modify anything in this program you want to. That's X why I distribute source. BUT, you may NOT distribute your changes, X unless you can be legally sure that you have made so many as to make X what you distribute a new work. X X If you learn any bad habits from reading my source code, tough luck. X X And if anything in this section is not legally supportable, the X joke's on me. Don't bother telling me, I'm too busy coding (So THERE!). X XAuthor: X X John W. M. Stevens - jstevens@csn.org END_OF_FILE if test 40962 -ne `wc -c <'unpost.doc'`; then echo shar: \"'unpost.doc'\" unpacked with wrong size! fi # end of 'unpost.doc' fi echo shar: End of archive 2 $of 7$.
cp /dev/null ark2isdone MISSING="" for I in 1 2 3 4 5 6 7 ; do if test ! -f ark${I}isdone ; then MISSING="${MISSING} ${I}" fi done if test "${MISSING}" = "" ; then echo You have unpacked all 7 archives. rm -f ark[1-9]isdone else echo You still must unpack the following archives: echo " " ${MISSING} fi exit 0 exit 0 # Just in case...