home *** CD-ROM | disk | FTP | other *** search
- _TAWK, A Simple Interpreter in C++_
- by Bruce Eckel
-
-
- [LISTING ONE]
-
-
- // FIELD.HXX: used by csascii class to build a single field.
- // Fields are collected by csascii to create a record.
- // by Bruce Eckel,
- #include <stream.hxx>
-
- // One field of a comma-separated ASCII record. The constructor reads the
- // field text from the given stream; csascii collects fields into a record.
- class field {
-     istream * input;    // where to get the data
-     char * data;        // heap buffer holding the field text; 0 while no
-                         // buffer has been allocated
-     int length, fsize;  // length: running counter/index used by getfield();
-                         // fsize: number of characters stored in data
-     int end_of_file;    // flag to indicate the end of file happened
-     void getfield();    // recursive function to read in a field;
-                         // treats data, length & input as globals
-     int infield;        // flag used by getfield() to determine whether
-                         // it's inside a quoted field
-     // The destructor frees data, so a member-wise copy would free the same
-     // buffer twice. Copying is declared private and never defined.
-     field(const field &);
-     field & operator=(const field &);
- public:
-     field(istream & instream);  // reads one field from instream
-     ~field();
-     // Writes the field text to s. Nothing is written when data is 0,
-     // because a null char* must not be handed to the stream.
-     friend ostream& operator<<(ostream &s, field & f) {
-         if (f.data)
-             s << f.data;
-         return s;
-     }
-     int eof() { return end_of_file; }     // non-zero once end of file was seen
-     int size() { return fsize; }          // number of characters in the field
-     int last_length() { return length; }  // raw value of the internal counter
-     char * string() { return data; }      // field text; may be 0 (see data)
- };
-
-
- [LISTING TWO]
-
- // FIELD.CXX: definitions for class field
- // A "recursive descent" scanning scheme is used because field
- // length is always unknown.
- // by Bruce Eckel
- #include "field.hxx"
-
- // Reads one field from instream. All state is reset first because
- // getfield() treats the members as its working variables.
- field::field(istream & instream) {
-     input = &instream;
-     length = 0;
-     fsize = 0;        // size() must not return garbage when getfield()
-                       // reaches end of file before storing a size
-     end_of_file = 0;  // set flag to say "we're not at the end"
-     infield = 0;      // set flag to say "we're not inside a field"
-     data = (char *)0; // to show no memory has been allocated
-     getfield();       // recursively get characters until end of field
- }
-
- // Releases the text buffer. getfield() allocates it with new char[...],
- // so the array form of delete is required.
- field::~field() {
-     delete [] data; // if no memory has been allocated,
-                     // data = (char *)0 so this will have no effect.
- }
-
- // Reads one field from input, one character per recursion level.
- // A comma-separated ASCII field may be contained in quotes to allow
- // commas within the field; these quotes are stripped out.
- // Descending: each call reads a character and counts it in length unless
- // it is a quote or CR. The descent stops at an unquoted comma, a newline
- // or end of file; only then is the size known and the buffer allocated.
- // Ascending: each call stores its own character at data[length--], so
- // the text is filled in from the back.
- void field::getfield() {
-     char c = 0;
-     int hit_eof = 0;
-     // This happens when DEscending:
-     if ((input->get(c)).eof()) {
-         end_of_file++; // just say we reached the end...
-         hit_eof = 1;
-     }
-     // watch out for the Unix vs. DOS LF/CR problem here:
-     if (!hit_eof && ((c != ',') || infield) && (c != '\n')) {
-         if ((c != '"') && (c != '\r')) // watch for quotes or CR
-             length++; // no quotes -- count this character
-         else {
-             if (c == '"')
-                 infield = !infield; // a quote toggles between "inside a
-                                     // quoted field" and "outside"
-             c = 0; // a quote or CR; mark it so it isn't included
-         }
-         getfield(); // recursively get characters in field
-     }
-     else { // This happens once, when the terminator is found.
-         // End of file is handled here too: the buffer must exist before
-         // the ascending calls below store characters into it.
-         fsize = length; // remember how long the string is
-         data = new char[length + 1]; // space for null terminator
-         data[length] = '\0'; // highest index is "length"
-         length--; // notice we don't insert the delimiter
-         return;
-     }
-     // This happens when Ascending:
-     if (c) // if it wasn't a quote or CR,
-         data[length--] = c; // put chars in as we rise back up...
- }
-
-
- [LISTING THREE]
-
- // CSASCII.HXX: class to manipulate comma-separated ASCII
- // database files.
- //by Bruce Eckel
- #include <stream.hxx>
- #include "field.hxx"
-
- // Manipulates comma-separated ASCII files, generated by most database
- // management systems (generated and used by the BASIC programming
- // language). Each field is separated by a comma; records are separated
- // by newlines. One record at a time is held in memory.
- class csascii {
-     int fieldcount;      // number of fields per record
-     field ** data;       // an array to hold the entire record
-     istream * datafile;  // file with comma separated ASCII input
-     int readrecord();    // private function to read a record
-     // The destructor deletes data and datafile, so a member-wise copy
-     // would delete them twice. Copying is declared private and never
-     // defined.
-     csascii(const csascii &);
-     csascii & operator=(const csascii &);
- public:
-     csascii( char * filename ); // Open file, get first record
-     ~csascii(); // destructor
-     int next(); // get next record, return 0 when EOF
-     field & operator[](int index); // select a field
-     int number_of_fields() { return fieldcount; }
- };
-
-
-
- [LISTING FOUR]
-
- // CSASCII.CXX: function definitions for comma-separated
- // ascii database manipulation class
- // by Bruce Eckel,
- #include "csascii.hxx"
-
- // Fills the record slots in order with freshly read fields.
- // Returns 0 as soon as a field reports end of file (that field has
- // already been stored in its slot), 1 when every slot was filled.
- int csascii::readrecord() {
-     int slot = 0;
-     while (slot < fieldcount) {
-         field * fresh = new field(*datafile);
-         data[slot] = fresh;
-         if (fresh->eof())
-             return 0;
-         slot++;
-     }
-     return 1;
- }
-
- // Opens filename twice: a first pass over the first line counts the
- // fields per record, then the file is re-opened and the first record is
- // read. Exits with a message if the file cannot be opened.
- csascii::csascii( char * filename ) {
-     char c;
-     int k;
-     int quote = 0;
-     fieldcount = 0;
-     // first, determine the number of fields in a record:
-     {
-         filebuf countbuf;
-         if (countbuf.open(filename, input) == 0) {
-             cerr << "cannot open " << filename << "\n";
-             exit(1);
-         }
-         istream infile(&countbuf);
-         // stop at the newline, or at end of file so that a file without
-         // any newline cannot keep this loop running forever:
-         while (!(infile.get(c)).eof() && c != '\n') {
-             // keep track of being inside a quoted string:
-             if (c == '"') quote = !quote;
-             // fields are delimited by unquoted commas:
-             if ( c == ',' && !quote)
-                 fieldcount++;
-         }
-     } // infile and countbuf go out of scope here
-     fieldcount++; // last field terminated by newline, not comma
-     // an array of field pointers, cleared so that no slot holds a garbage
-     // pointer if the first read stops early:
-     data = new field * [ fieldcount ];
-     for (k = 0; k < fieldcount; k++)
-         data[k] = 0;
-     // re-open at start; dynamically allocate so it isn't scoped:
-     datafile = new istream((new filebuf)->open(filename, input));
-     readrecord();
- }
-
- // Releases the pointer array and the input stream. The array comes from
- // new field * [...], so the array form of delete is required.
- // The field objects themselves are not deleted here: readrecord() can
- // return early and next() does not reset the slots it has deleted, so a
- // slot is not guaranteed to hold a live pointer.
- csascii::~csascii() {
-     delete [] data;
-     delete datafile; // calls istream destructor to close file
- }
-
- // Discards the current record and reads the next one.
- // Returns 0 when end of file was reached, non-zero otherwise.
- int csascii::next() {
-     for (int i = 0; i < fieldcount; i++ ) {
-         delete data[i]; // free all the data storage
-         data[i] = 0;    // readrecord() may stop before refilling this
-                         // slot; a cleared slot can be deleted again safely
-     }
-     return readrecord(); // 0 when end of file
- }
-
- // Selects field number index of the current record.
- // Prints a message and exits when index is outside 0..fieldcount-1.
- field & csascii::operator[](int index) {
-     if (index < 0) {
-         cerr << "negative index for field in record\n";
-         exit(1);
-     }
-     if (index >= fieldcount) {
-         cerr << "index too large for number of fields in record\n";
-         exit(1);
-     }
-     return *(data[index]);
- }
-
-
- [LISTING FIVE]
-
- // LOOKUP.CXX: simple use of csascii to find name in a database
- // by Bruce Eckel,
- #include "csascii.hxx"
- #include <string.h>
-
- // lookup lastname: shows every record whose first field equals the
- // given name, one screen per record.
- main(int argc, char ** argv) {
-     if (argc < 2) {
-         cerr << "usage: lookup lastname\n";
-         exit(1);
-     }
-     // The database file lives in the root directory:
-     csascii file("\\ppquick.asc"); // create object & open file
-     int matches = 0; // number of matching records shown so far
-     do {
-         if (strcmp(file[0].string(),argv[1]) != 0) {
-             // File is sorted: once matches stop coming, quit instead
-             // of wasting time.
-             if (matches) exit(0);
-             continue; // not reached the matches yet; try next record
-         }
-         matches++;
-         cout << chr(27) << "[2J"; // ANSI clear screen
-         for (int f = 0; f < file.number_of_fields(); f++)
-             cout << file[f] << "\n";
-         cout << chr(27) << "[7m" << "press any key" <<
-             chr(27) << "[0m";
-         if( getch() == 27) break; // ESC ends the lookup
-     } while (file.next());
- }
-
-
- [LISTING SIX]
-
-
- // PARSE.HXX: class to parse a tawk script file. Creates
- // a structure which can be used at run-time to "execute"
- // the tawk script.
- // by Bruce Eckel,
- #include <stream.hxx>
-
- // Kinds of token the scanner can produce:
- enum tokentype {
-     fieldnumber,  // @(n)  : contents of field n
-     string,       // literal text
-     if_,          // @?n@: : start of a conditional on field n
-     else_,        // @~
-     endif_,       // @.
-     phase_change  // @main or @conclusion
- };
-
- // Sections of a tawk script. preamble and conclusion are executed only
- // once; main is executed once for every data record.
- enum phase {
-     preamble,
-     tmain,
-     conclusion
- };
-
- // One token of a tawk script. The constructor scans the token from the
- // given stream.
- class token {
-     tokentype ttype;
-     union { // an "anonymous union"
-         int fieldnum; // if type is a fieldnumber
-         unsigned char * literal; // if type is a string
-     };
-     int if_level; // nesting level; set only for if_, else_ and endif_
-     // private functions:
-     void get_token(); // recursive descent scanner
-     // Functions to help in scanning:
-     void getnext(char & c); // used by get_token();
-     unsigned char get_value(char delimiter, char * msg);
-     void dumpline(); // for @! comments
-     void error(char * msg = "", char * msg2 = "");
-     // The destructor frees literal for string tokens, so a member-wise
-     // copy would free the same buffer twice. Copying is declared private
-     // and never defined.
-     token(const token &);
-     token & operator=(const token &);
- public:
-     token(istream & input);
-     ~token();
-     friend ostream & operator<<(ostream &s, token &t);
-     int field_number() { return fieldnum; }
-     int token_type() { return ttype; }
-     int nesting_level() { return if_level;}
- };
-
- // The following is called a "container class," since its sole
- // purpose is to hold a list of objects (tokens, in this case).
- // The constructor reads tokens from the stream into the array.
- class parse_array {
-     token ** tokenarray; // an array of token pointers
-     istream * parse_stream;
-     int token_count;     // working counter used by build_array()
-     int end; // the size of the array
-     phase p_section; // of the program (preamble, etc.)
-     void build_array(); // another recursive function
-     // The destructor deletes the tokens and the array, so a member-wise
-     // copy would delete them twice. Copying is declared private and never
-     // defined.
-     parse_array(const parse_array &);
-     parse_array & operator=(const parse_array &);
- public:
-     parse_array(istream & input);
-     ~parse_array();
-     int size() { return end; } // how big is it?
-     token & operator[](int index); // select a token
-     phase section() { return p_section; }
- };
-
-
- [LISTING SEVEN]
-
- // PARSE.CXX: class parse function definitions
- // by Bruce Eckel,
- #include "csascii.hxx"
- #include "parse.hxx"
- #include <ctype.h>
- #include <stdlib.h>
-
- // The following are "file static," which means no one outside
- // this file can know about them. This is the meaning when a
- // global variable is declared "static."
- static istream * tokenstream;
- static int length; // to remember size of string
- static int line_number = 1; // line counting for errors
- static int if_counter = 0; // monitors "if" statement nesting
- static phase program_section = preamble; // ... until @main
- static int end_of_file = 0; // zero means not end of file
-
- // Scans one token from input. The file-static scanner state is reset
- // first, then the recursive scanner does the work.
- token::token(istream & input) {
-     length = 0;            // nothing collected yet
-     tokenstream = &input;  // source used by getnext()
-     get_token();           // recursively get characters to end of token
- }
-
- // Only string tokens own heap storage. get_token() allocates it with
- // new unsigned char[...], so the array form of delete is required.
- token::~token() {
-     if (ttype == string)
-         delete [] literal;
- }
-
- void token::error(char * msg, char * msg2) {
- cerr << "token error on line " << line_number << ": " <<
- msg << " " << msg2 << "\n";
- exit(1);
- }
-
- // Prints a token: string tokens print their text, fieldnumber tokens
- // print a diagnostic line (only for testing). Other kinds print nothing.
- ostream & operator<<(ostream &s, token &t) {
-     if (t.ttype == string)
-         s << (char *)t.literal;
-     else if (t.ttype == fieldnumber)
-         s << " fieldnumber: " << t.fieldnum << "\n";
-     return s;
- }
-
- // Fetches the next character of the script into c.
- // It is an error to read once @end has been seen, or to run out of
- // input. Newlines are counted so error messages can name the line.
- void token::getnext(char & c) {
-     if (end_of_file)
-         error("attempt to read after @end statement\n",
-             "missing @conclusion ?");
-     tokenstream->get(c);
-     if (tokenstream->eof())
-         error("@end statement missing");
-     if (c == '\n')
-         ++line_number; // keep track of the line count
- }
-
- // Recursive scanner for one token (see text for description of tokens).
- // At the start of a token (length == 0) an '@' introduces a command,
- // selected by the character after it. Anything else is literal text,
- // collected one character per recursion level: descending counts the
- // characters, an '@' in the middle of the text ends the string and
- // allocates the buffer, and the ascending calls store their characters
- // at literal[length--], filling the text in from the back.
- void token::get_token() {
-     char c;
-     // This happens when DEscending:
-     getnext(c);
-     if ( c == '@') {
-         if (length == 0) { // length 0 means start of token
-             getnext(c);
-             switch(c) {
-                 case '!': // comment line
-                     dumpline(); // dump the comment
-                     get_token(); // get a real token
-                     // The nested call produced the complete token. This
-                     // frame has no character to contribute, so it must
-                     // not reach the tail store below.
-                     return;
-                 case 'p' : case 'P' : // preamble statement
-                     if ( program_section != preamble )
-                         error("only one preamble allowed");
-                     dumpline(); // just for looks, ignore it
-                     get_token(); // get a real token
-                     return; // as for '!': nothing to store for this frame
-                 case 'm' : case 'M' : // start of main loop
-                     dumpline(); // toss rest of line
-                     program_section = tmain;
-                     ttype = phase_change;
-                     return; // very simple token
-                 case 'c' : case 'C' : // start conclusion
-                     dumpline();
-                     program_section = conclusion;
-                     ttype = phase_change;
-                     return; // very simple token
-                 case 'e' : case 'E': // end statement
-                     end_of_file++; // set flag
-                     ttype = fieldnumber; // so destructor doesn't
-                     // delete free store for this token.
-                     if (if_counter)
-                         error("unclosed 'if' statement(s)");
-                     return;
-                 case '(' :
-                     if ( program_section == preamble ||
-                          program_section == conclusion )
-                         error("@() not allowed in preamble or conclusion");
-                     fieldnum = get_value(')',"@()");
-                     ttype = fieldnumber;
-                     // This is a complete token, so quit
-                     return;
-                 case '<' :
-                     // the character code becomes the first character of
-                     // a string token
-                     c = get_value('>',"@<>");
-                     length++;
-                     get_token(); // get more...
-                     break;
-                 case '?' : // beginning of an "if" statement
-                     if ( program_section == preamble ||
-                          program_section == conclusion )
-                         error("@? not allowed in preamble or conclusion");
-                     fieldnum = get_value('@',"@?@");
-                     ttype = if_;
-                     getnext(c); // just eat the colon
-                     if(c != ':')
-                         error("@? must be followed by @: (then)");
-                     if_level = ++if_counter; // for nesting
-                     return;
-                 case '~' : // the "else" part of an "if" statement
-                     ttype = else_;
-                     if_level = if_counter;
-                     return;
-                 case '.' : // "endif" terminator of an "if" statement
-                     ttype = endif_;
-                     if_level = if_counter--;
-                     if(if_counter < 0)
-                         error("incorrect nesting of if-then-else clauses");
-                     return;
-                 case '@' : // two '@' in a row mean print an '@'
-                     length++; // just leave '@' as the value of c
-                     get_token();
-                     break;
-                 default:
-                     // the list names exactly the characters handled above
-                     error("'@' must be followed by:",
-                         "'!', '(', '<', '?', '~', '.', 'p', 'm', 'c', 'e' or '@'");
-             }
-         } else { // an '@' in the middle of a string; terminate
-             // the string. Putback() is part of the stream class.
-             // It is only safe to put one character back on the input
-             tokenstream->putback(c); // to be used by the next token
-             // allocate space, put the null in and return up the stack
-             literal = new unsigned char[length + 1]; // space for '\0'
-             literal[length--] = '\0'; // string delimiter
-             ttype = string; // what kind of token this is
-             return; // back up the stack
-         }
-     } else { // not an '@', must be plain text
-         length++;
-         get_token();
-     }
-     // This occurs on the "tail" of the recursion, only for frames that
-     // counted a character on the way down:
-     literal[length--] = c; // put chars in as we rise back up...
- }
-
- // This function is used by get_token when it encounters a @(, @< or @?
- // to read a decimal number up to "delimiter" and return its value.
- // If an error occurs, msg is used to notify the user what
- // kind of statement it is. Errors end the program: a non-digit, more
- // digits than the buffer holds, or a value that does not fit in the
- // unsigned char return type.
- unsigned char token::get_value(char delimiter, char * msg) {
-     char c;
-     char buf[5];
-     int i = 0;
-     int value;
-     while(getnext(c), c != delimiter) {
-         if (!isdigit((unsigned char)c))
-             error("must use only digits inside", msg);
-         if (i >= (int)sizeof(buf) - 1) // keep room for the terminator
-             error("too many digits inside", msg);
-         buf[i++] = c;
-     }
-     buf[i] = 0;
-     value = atoi(buf);
-     if (value > 255) // would be silently truncated by the return type
-         error("value must not exceed 255 inside", msg);
-     return value;
- }
-
- // Discards the rest of the current script line, newline included.
- // Called for '@!' comments and after section keywords.
- void token::dumpline() {
-     char c;
-     do {
-         getnext(c); // just eat characters until newline
-     } while (c != '\n');
- }
-
- // The size of a parse_array is not known until its part of the
- // tawkfile has been tokenized, so the array is built recursively.
- // The section that is current when construction starts is recorded so
- // it is known at run-time.
- parse_array::parse_array(istream & input) {
-     p_section = program_section;
-     token_count = 0;
-     parse_stream = &input;
-     build_array();
- }
-
- // Reads tokens recursively, one per level, until @end or a phase change
- // is seen. Descending counts the tokens; the last level allocates the
- // array; the ascending levels store their tokens from the back.
- void parse_array::build_array() {
-     token * tk = new token(*parse_stream);
-     if( ! end_of_file && tk->token_type() != phase_change) {
-         // normal token, not end of file or phase change:
-         token_count++;
-         // recursively get tokens until eof or phase change:
-         build_array();
-     } else { // end of file or phase change
-         // the terminating token is never stored in the array, so it has
-         // to be released here or it would be lost:
-         delete tk;
-         // only done once per object:
-         // allocate memory and return up the stack
-         tokenarray = new token * [end = token_count];
-         if(token_count) token_count--; // only if non-zero
-         return;
-     }
-     tokenarray[token_count--] = tk; // performed on the "tail"
- }
-
-
- // Deletes every stored token, then the pointer array. The array comes
- // from new token * [...], so the array form of delete is required.
- parse_array::~parse_array() {
-     for (int i = 0; i < end; i++)
-         delete tokenarray[i];
-     delete [] tokenarray;
- }
-
- // Selects token number index. Prints a message and exits when index is
- // outside 0..size()-1.
- token & parse_array::operator[](int index) {
-     if ( index < 0 || index >= end ) {
-         cerr << "parse_array error: index " << index
-             << " out of bounds\n";
-         exit(1);
-     }
-     return *tokenarray[index];
- }
-
-
- [LISTING EIGHT]
-
- // TAWK.CXX: parses a tawk script and reads an ascii file;
- // generates results according to the tawk script.
- // by Bruce Eckel,
- #include "csascii.hxx"
- #include "parse.hxx"
-
- // tawk tawkfile datafile [-s]
- // Tokenizes the three sections of the tawk script, prints the preamble
- // once, runs the @main tokens for every record of the data file and
- // finally prints the conclusion. With -s the output is paged on screen.
- main (int argc, char * argv[]) {
-     int screen = 0; // flag set true if screen output desired
-     int i;          // index for all loops below; declared once so that no
-                     // loop depends on a for-init variable staying visible
-                     // after its own loop
-     if (argc < 3) {
-         cerr << "usage: tawk tawkfile datafile\n" <<
-             "trailing -s pages output to screen";
-         exit(1);
-     }
-     if (argc == 4) {
-         if (argv[3][0] != '-') {
-             cerr << "must use '-' before trailing flag\n";
-             exit(1);
-         } else
-         if (argv[3][1] != 's') {
-             cerr << "'s' is only trailing flag allowed";
-             exit(1);
-         } else
-             screen++; // set screen output flag true
-     }
-     // Open the script and report a failure instead of scanning a stream
-     // that has no file behind it:
-     filebuf tawkbuf;
-     if (tawkbuf.open(argv[1], input) == 0) {
-         cerr << "cannot open " << argv[1] << "\n";
-         exit(1);
-     }
-     istream tawkfile(&tawkbuf);
-     parse_array Apreamble(tawkfile); // the @preamble
-     parse_array Amain(tawkfile); // the @main section
-     parse_array Aconclusion(tawkfile); // the @conclusion
-     csascii datafile(argv[2]); // make a comma-separated ASCII
-                                // object from the second arg
-     // ------ @preamble ------
-     for (i = 0; i < Apreamble.size(); i++)
-         cout << Apreamble[i]; // preamble can only contain strings
-     if(screen) {
-         // ANSI reverse video sequence:
-         cout << chr(27) << "[7m" << "press any key" <<
-             chr(27) << "[0m";
-         getch();
-     }
-     // ------ The Central Loop (@main) -------
-     do { // for each record in the data file
-         if(screen) cout << chr(27) << "[2J"; // ANSI clear screen
-         for(i = 0; i < Amain.size(); i++) {
-             switch(Amain[i].token_type()) {
-                 case fieldnumber:
-                     cout << datafile[Amain[i].field_number()];
-                     break;
-                 case string:
-                     cout << Amain[i];
-                     break;
-                 case if_: { // braces keep the locals out of later cases
-                     int fn = Amain[i].field_number();
-                     if (datafile[fn].size() == 0) { // conditional false
-                         int level = Amain[i].nesting_level();
-                         // find the "else" on the same level, or the
-                         // "endif" when the statement has no "else":
-                         while ( !((Amain[i].token_type() == else_
-                                 || Amain[i].token_type() == endif_)
-                                 && Amain[i].nesting_level() == level))
-                             i++;
-                     } // conditional true -- just continue
-                     break;
-                 }
-                 case else_: { // an "if" conditional was true so skip
-                     // all the statements in the "else" clause
-                     int level = Amain[i].nesting_level();
-                     // find the "endif" statement on the same level:
-                     while ( !(Amain[i].token_type() == endif_
-                             && Amain[i].nesting_level() == level))
-                         i++;
-                     break;
-                 }
-                 case endif_: // after performing the "else" clause
-                     break; // ignore it; only used to find the end
-                            // of the conditional when "if" is true.
-                 default: // should never happen (caught in parsing)
-                     cerr << "unknown statement encountered at run-time\n";
-                     exit(1);
-             }
-         }
-         if(screen) {
-             cout << chr(27) << "[7m" <<
-                 "press a key (ESC quits)" << chr(27) << "[0m";
-             if( getch() == 27) break;
-         }
-     } while (datafile.next()); // matches do { ...
-     // ------ @conclusion ------
-     for ( i = 0; i < Aconclusion.size(); i++)
-         cout << Aconclusion[i]; //conclusion contains only strings
-     return 0;
- }
-
-
-
- [LISTING NINE]
-
- # makefile for tawk.exe & lookup.exe
- # Zortech C++:
- CPP = ztc
- # Glockenspiel C++ w/ MSC 4:
- #CPP = ccxx !4
-
- all: tawk.exe lookup.exe
-
- tawk.exe : tawk.obj parse.obj csascii.obj field.obj
- 	$(CPP) tawk.obj parse.obj csascii.obj field.obj
-
- # lookup.cxx includes csascii.hxx, which includes field.hxx
- lookup.exe : lookup.cxx csascii.hxx field.hxx csascii.obj field.obj
- 	$(CPP) lookup.cxx csascii.obj field.obj
-
- tawk.obj : tawk.cxx parse.hxx csascii.hxx field.hxx
- 	$(CPP) -c tawk.cxx
-
- # parse.cxx includes csascii.hxx (and through it field.hxx) as well
- parse.obj : parse.cxx parse.hxx csascii.hxx field.hxx
- 	$(CPP) -c parse.cxx
-
- csascii.obj : csascii.cxx csascii.hxx field.hxx
- 	$(CPP) -c csascii.cxx
-
- field.obj : field.cxx field.hxx
- 	$(CPP) -c field.cxx
-
-
-
- [LISTING TEN]
-
- @! REFORM.TWK
- @! A tawk script to reformat a comma-separated ASCII file
- @! with 6 fields. This creates a new CS-ASCII file with
- @! fields 4 and 5 combined.
- @! Each output record: fields 0-3 quoted one by one, then a single quoted
- @! field made of field 4, a blank if field 4 is not empty, and field 5.
- @main
- "@(0)","@(1)","@(2)","@(3)","@(4)@?4@: @~@.@(5)"
- @conclusion
- @end
-
-
-
- [LISTING ELEVEN]
-
-
- @! WALLET.TWK
- @! Tawkfile to create a tiny phone listing for a wallet
- @! on a Hewlett-Packard Laserjet-compatible printer
- @! From a comma-separated ASCII file generated by a DBMS
- @preamble
- @<27>&l5C@! approximately 10 lines per inch
- @<27>(s16.66H@! small typeface, built into Laserjet
- @main
- @! last, first, (area code) phone1
- @(0),@(1)(@(2))@?3@:@(3)
- @! phone2, if it exists
- @?4@:@(4)
- @~@.@~@?4@:@(4)
- @~
- @.@.@conclusion
- @<27>E @! Reset the Laserjet
- @end
-
- [EXAMPLE 1]
-
- // Minimal class showing private data, an in-line member function, a
- // constructor and a destructor.
- class tiny {
-     // private stuff here (this is a comment)
-     int i;
- public: // public stuff here:
-     // Prints the stored value. Declared void: the function returns
-     // nothing, so it must not be left with an implicit int return type.
-     void print() { // an "in-line" function
-         printf("i = %d\n",i);
-     }
-     tiny(int j); // constructors have the class name
-     ~tiny() {} // destructors use a tilde
- }; // classes end with a brace and a semicolon
-
- // non inline definition; the member is set through the initializer list
- tiny::tiny(int j) : i(j) {
- }
-
- main() {
-     tiny sample(2); // the constructor is called implicitly here
-     // sample.i = 30; // error! private member
-     sample.print(); // calling a member function
-     // the destructor is called implicitly at end of scope
- }
-
-
-
-
-
- [EXAMPLE 2]
-
-
- #include <stream.hxx> // cout automatically defined
- // Chained output: strings and numbers go through the same operator.
- main() {
-     cout << "Hello, world!\n" << "I am "
-         << 6 << " today!\n"; // leading blank keeps "6" and "today!" apart
- }
-
-
-
-
- [EXAMPLE 3]
-
- // Open the file named by the first argument; give up if that fails.
- filebuf f1;
- if (!f1.open(argv[1],input)) {
-     cout << "cannot open " << argv[1] << "\n";
-     exit(1);
- }
- // the stream reads through the buffer opened above
- istream infile(&f1);
-
-
-
- [EXAMPLE 4]
-
-
- "Ball","Mike","Oregon Software C++ Compiler"
- "Bright","Walter","Zortech C++ Compiler"
- "Carolan","John","Glockenspiel C++ Translator"
- "Stroustrup","Bjarne","AT&T, C++ Creator"
- "Tiemann","Michael","Free Software Foundation C++ Compiler"
-
-