home *** CD-ROM | disk | FTP | other *** search
- /*
- rtf2text - read rtf input, write text of document (text extraction).
-
- This installs callbacks for the ascii and control token classes.
- The control class is necessary so that special characters such as
- \par, \tab, \sect, etc. can be converted.
-
- It's problematic what to do with text in headers and footers, and
- what to do about tables.
-
- This really is quite a stupid program. For instance, it could keep
- track of the current leader character and dump that out when a tab
- is encountered, but it does not.
-
- 04 Feb 91 Paul DuBois dubois@primate.wisc.edu
-
- 04 Feb 91 V1.0. Created.
- 27 Feb 91 V1.01. Updated for distribution 1.05.
- */
-
- # include <stdio.h>
- # include "rtf.h"
-
-
- /*
- structure for mapping character values >= 128 to text strings
- for different character sets.
- */
-
/*
	One translation entry: a character value and the plain-text string
	written in its place.  A table of these is ended by an entry whose
	charStr is NULL.
*/

typedef struct CharMap
{
	int	charVal;	/* character value to match */
	char	*charStr;	/* replacement text; NULL ends the table */
} CharMap;
-
- extern CharMap ansiCharMap[]; /* these are defined below */
- extern CharMap macCharMap[];
- extern CharMap pcCharMap[];
- extern CharMap pcaCharMap[];
-
- /*
- Default is ANSI but I hope we don't see \ansi, since its char
- map is empty...
- */
-
- CharMap *charMap = ansiCharMap;
-
-
- static void Text ();
- static void Control ();
- static void CharSet ();
- static void Destination ();
- static void SpecialChar ();
-
-
- int main (argc, argv)
- int argc;
- char **argv;
- {
- RTFInit ();
-
- --argc;
- ++argv;
-
- /* not clever; only allows stdin or one named file to be read */
-
- if (argc > 0)
- {
- if (freopen (argv[0], "r", stdin) == (FILE *) NULL)
- {
- fprintf (stderr, "Can't open \"%s\"\n", argv[0]);
- exit (1);
- }
- }
-
- /* install class callbacks and process the input stream */
-
- RTFSetClassCallback (rtfText, Text);
- RTFSetClassCallback (rtfControl, Control);
- RTFRead ();
-
- exit (0);
- }
-
- static void Text ()
- {
- PutChar (rtfMajor);
- }
-
-
- static void Control ()
- {
- switch (rtfMajor)
- {
- case rtfCharSet:
- CharSet ();
- break;
- case rtfDestination:
- Destination ();
- break;
- case rtfSpecialChar:
- SpecialChar ();
- break;
- }
- }
-
-
- static void CharSet ()
- {
- switch (rtfMinor)
- {
- case rtfAnsiCharSet:
- charMap = ansiCharMap;
- break;
- case rtfMacCharSet:
- charMap = macCharMap;
- break;
- case rtfPcCharSet:
- charMap = pcCharMap;
- break;
- case rtfPcaCharSet:
- charMap = pcaCharMap;
- break;
- }
- }
-
-
- /*
- This function notices destinations that should be ignored
- and skips to their ends. This keeps, for instance, picture
- data from being considered as plain text.
- */
-
- static void Destination ()
- {
- switch (rtfMinor)
- {
- case rtfPict:
- case rtfFNContSep:
- case rtfFNContNotice:
- case rtfInfo:
- case rtfIndexRange:
- case rtfITitle:
- case rtfISubject:
- case rtfIAuthor:
- case rtfIOperator:
- case rtfIKeywords:
- case rtfIComment:
- case rtfIVersion:
- case rtfIDoccomm:
- RTFSkipGroup ();
- break;
- }
- }
-
-
- static void SpecialChar ()
- {
- switch (rtfMinor)
- {
- case rtfPage:
- case rtfSect:
- case rtfRow:
- case rtfLine:
- case rtfPar:
- PutChar ('\n');
- break;
- case rtfCell:
- PutChar (' '); /* make sure cells are separated */
- break;
- case rtfNoBrkSpace:
- PutChar (' ');
- break;
- case rtfTab:
- PutChar ('\t');
- break;
- case rtfNoBrkHyphen:
- PutChar ('-');
- break;
- }
- }
-
-
- /*
- Eventually this should keep track of the destination of the
- current state and only write text when in the initial state.
- */
-
- PutChar (c)
- int c;
- {
- CharMap *cmp;
- char *p = "X";
-
- if (c < 128)
- putchar (c);
- else
- {
- for (cmp = charMap; cmp->charStr != (char *) NULL; cmp++)
- {
- if (c == cmp->charVal)
- {
- p = cmp->charStr;
- break;
- }
- }
- fputs (p, stdout);
- }
- }
-
-
- CharMap ansiCharMap [] =
- {
- 0, NULL
- };
-
-
- CharMap macCharMap [] =
- {
- 0xa0, "+", /* dagger */
- 0xa1, "deg.", /* degree */
- 0xa2, "cents", /* cent */
- 0xa5, "o", /* bullet */
- 0xa7, "B", /* German B? */
- 0xa8, "reg.", /* registered */
- 0xa9, "(c)", /* copyright */
- 0xaa, "(TM)", /* trademark */
- 0xab, "'", /* acute accent */
- 0xad, "!=", /* not equal */
- 0xae, "AE", /* joined A-E */
- 0xb1, "+/-", /* plus or minus */
- 0xb2, "<=", /* less than or equal */
- 0xb3, ">=", /* greater than or equal */
- 0xb5, "u", /* micro */
- 0xb6, "d", /* delta */
- 0xbe, "ae", /* joined a-e */
- 0xc5, "~", /* approximately */
- 0xc7, "<<", /* alternate quote */
- 0xc8, ">>", /* alternate end-quote*/
- 0xc9, "...", /* ellipsis */
- 0xca, " ", /* unbreakable space */
- 0xd0, "-", /* short dash */
- 0xd1, "--", /* long dash */
- 0xd2, "\"", /* left curly double quote */
- 0xd3, "\"", /* right curly double quote */
- 0xd4, "`", /* left curly single quote */
- 0xd5, "'", /* right curly single quote */
- 0xd6, "/", /* divide */
- 0, NULL
- };
-
-
- CharMap pcCharMap [] =
- {
- 0, NULL
- };
-
-
- CharMap pcaCharMap [] =
- {
- 0, NULL
- };
-