home *** CD-ROM | disk | FTP | other *** search
/ Source Code 1992 March / Source_Code_CD-ROM_Walnut_Creek_March_1992.iso / usenet / altsrcs / 1 / 1425 / regular.c < prev    next >
Encoding:
C/C++ Source or Header  |  1990-12-28  |  32.5 KB  |  1,230 lines

  1. /*
  2. %  A regular expression is zero or more branches, separated by `|'. It
  3. %  matches anything that matches one of the branches.
  4. %  
  5. %  A branch is zero or more pieces, concatenated. It matches a match for
  6. %  the first, followed by a match for the second, etc.
  7. %  
  8. %  A piece is an atom possibly followed by `*', `+', or `?'. An atom
  9. %  followed by `*' matches a sequence of 0 or more matches of the atom. An
  10. %  atom followed by `+' matches a sequence of 1 or more matches of the
  11. %  atom. An atom followed by `?' matches a match of the atom, or the null
  12. %  pattern.
  13. %  
  14. %  An atom is a regular expression in parentheses (matching a match for
  15. %  the regular expression), a range (see below), `.' (matching any single
  16. %  character), `^' (matching the null pattern at the beginning of the input
  17. %  pattern), `$' (matching the null pattern at the end of the input pattern),
  18. %  a `\' fol- lowed by a single character (matching that character), or a
  19. %  single character with no other significance (matching that character).
  20. %  
  21. %  A range is a sequence of characters enclosed in `[]'. It normally
  22. %  matches any single character from the sequence. If the sequence begins
  23. %  with `^', it matches any single charac- ter not from the rest of the
  24. %  sequence. If two characters in the sequence are separated by `-', this
  25. %  is shorthand for the full list of ASCII characters between them (e.g.
  26. %  `[0-9]' matches any decimal digit). To include a literal `]' in the
  27. %  sequence, make it the first character (following a possible `^').  To
  28. %  include a literal `-', make it the first or last character.
  29. %  
  30. %  If a regular expression could match two different parts of the input
  31. %  pattern, it will match the one which begins earliest. If both begin in
  32. %  the same placebut match dif- ferent lengths, or match the same length
  33. %  in different ways, life gets messier, as follows.
  34. %  
  35. %  In general, the possibilities in a list of branches are con- sidered in
  36. %  left-to-right order, the possibilities for `*', `+', and `?' are
  37. %  considered longest-first, nested constructs are considered from the
  38. %  outermost in, and concatenated con- structs are considered
  39. %  leftmost-first. The match that will be chosen is the one that uses the
  40. %  earliest possibility in the first choice that has to be made. If there
  41. %  is more than one choice, the next will be made in the same manner
  42. %  (earli- est possibility) subject to the decision on the first choice.
  43. %  And so forth.
  44. %  
  45. %  For example, `(ab|a)b*c' could match `abc' in one of two ways. The
  46. %  first choice is between `ab' and `a'; since `ab' is earlier, and does
  47. %  lead to a successful overall match, it is chosen. Since the `b' is
  48. %  already spoken for, the `b*' must match its last possibility-the empty
  49. %  pattern-since it must respect the earlier choice.
  50. %  
  51. %  In the particular case where no `|'s are present and there is only one
  52. %  `*', `+', or `?', the net effect is that the longest possible match
  53. %  will be chosen. So `ab*', presented with `xabbbby', will match `abbbb'.
  54. %  Note that if `ab*' is tried against `xabyabbbz', it will match `ab'
  55. %  just after `x', due to the begins-earliest rule. (In effect, the deci-
  56. %  sion on where to start the match is the first choice to be made, hence
  57. %  subsequent choices must respect it even if this leads them to
  58. %  less-preferred alternatives.)
  59. %  
  60. %  CompileRegularExpression returns NULL for a failure, where failures are
  61. %  syntax errors, exceeding implementation limits, or applying `+' or `*'
  62. %  to a possibly-null operand.
  63. %
  64. %  This is essentially the same routine written and copywrited by Henry 
  65. %  Spencer, University of Toronto.  I made minor programming changes but 
  66. %  major variable name changes to improve readibility.
  67. %
  68. %
  69. */
  70. #include <stdio.h>
  71. #include <ctype.h>
  72. #include <malloc.h>
  73. #include "regular.h"
  74.  
  75. char
  76.   *code,
  77.   **subpattern_end,
  78.   *p,
  79.   start_code,
  80.   *start_pattern,
  81.   **subpattern;
  82.  
  83. extern char
  84.   *strchr();
  85.  
  86. static char    
  87.   *token;  
  88.  
  89. static int      
  90.   number_parenthesis;  
  91.  
  92. static long     
  93.   code_size;  
  94.  
  95. /*
  96. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  97. %                                                                             %
  98. %                                                                             %
  99. %                                                                             %
  100. %   A t o m                                                                   %
  101. %                                                                             %
  102. %                                                                             %
  103. %                                                                             %
  104. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  105. %
  106. %
  107. */
  108. static char *Atom(flagp)
  109. int
  110.   *flagp;
  111. {
  112.   char
  113.     *Node(),
  114.     *Regular();
  115.  
  116.   int
  117.     flags;
  118.  
  119.   register char
  120.     *status;
  121.  
  122.   void
  123.     EmitCode();
  124.  
  125.   *flagp=WorstCase;
  126.   switch(*token++)
  127.   {
  128.     case '^':
  129.     {
  130.       status=Node(MatchBeginningOfLine);
  131.       break;
  132.     }
  133.     case '$':
  134.     {
  135.       status=Node(MatchEndOfProgramOfLine);
  136.       break;
  137.     }
  138.     case '.':
  139.     {
  140.       status=Node(MatchAnyCharacter);
  141.       *flagp|=NonNull | Simple;
  142.       break;
  143.     }
  144.     case '[':
  145.     {
  146.       register int
  147.         class,
  148.         class_end;
  149.  
  150.       if (*token != '^')
  151.         status=Node(MatchAnyCharacterOf);
  152.       else
  153.         {
  154.           /*
  155.             Complement of range.
  156.           */
  157.           status=Node(MatchAnyCharacterBut);
  158.           token++;
  159.         }
  160.       if ((*token == ']') || (*token == '-'))
  161.         EmitCode(*token++);
  162.       while ((*token != '\0') && (*token != ']'))
  163.       {
  164.         if (*token != '-')
  165.           EmitCode(*token++);
  166.          else
  167.           {
  168.             token++;
  169.             if ((*token == ']') || (*token == '\0'))
  170.               EmitCode('-');
  171.             else
  172.               {
  173.                 class=((int)*(unsigned char *)(token-2))+1;
  174.                 class_end=((int)*(unsigned char *)(token));
  175.                 if (class > class_end+1)
  176.                   Fail("invalid [] range");
  177.                 for(; class <= class_end; class++)
  178.                   EmitCode(class);
  179.                 token++;
  180.               }
  181.           }
  182.       }
  183.       EmitCode('\0');
  184.       if (*token != ']')
  185.         Fail("unmatched []");
  186.       token++;
  187.       *flagp|=NonNull | Simple;
  188.       break;
  189.     }
  190.     case '(':
  191.     {
  192.       status=Regular(1,&flags);
  193.       if (status == NULL)
  194.         return(NULL);
  195.       *flagp|=flags & (NonNull | SpecialStart);
  196.       break;
  197.     }
  198.     case '\0':
  199.     case '|':
  200.     case ')':
  201.     {
  202.       Fail("internal urp");
  203.       break;
  204.     }
  205.     case '?':
  206.     case '+':
  207.     case '*':
  208.     {
  209.       Fail("?+* follows nothing");
  210.       break;
  211.     }
  212.     case '\\':
  213.     {
  214.       if (*token == '\0')
  215.         Fail("trailing \\");
  216.       status=Node(MatchExactly);
  217.       EmitCode(*token++);
  218.       EmitCode('\0');
  219.       *flagp|=NonNull | Simple;
  220.       break;
  221.     }
  222.     default:
  223.     {
  224.       register char
  225.         ender;
  226.  
  227.       register int
  228.         length;
  229.  
  230.       token--;
  231.       length=strcspn(token,Meta);
  232.       if (length <= 0)
  233.         Fail("internal disaster");
  234.       ender=(*(token+length));
  235.       if (length > 1 && MultipleMatches(ender))
  236.         length--;
  237.       *flagp|=NonNull;
  238.       if (length == 1)
  239.         *flagp|=Simple;
  240.       status=Node(MatchExactly);
  241.       while (length > 0)
  242.       {
  243.         EmitCode(*token++);
  244.         length--;
  245.       }
  246.       EmitCode('\0');
  247.       break;
  248.     }
  249.   }
  250.   return(status);
  251. }
  252.  
  253. /*
  254. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  255. %                                                                             %
  256. %                                                                             %
  257. %                                                                             %
  258. %   B r a n c h                                                               %
  259. %                                                                             %
  260. %                                                                             %
  261. %                                                                             %
  262. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  263. %
  264. %  Function Branch Implements the | operator.
  265. %
  266. %
  267. */
  268. static char *Branch(flagp)
  269. int
  270.   *flagp;
  271. {
  272.   char
  273.     *Node(),
  274.     *Piece();
  275.  
  276.   int
  277.     flags;
  278.  
  279.   register char
  280.     *chain,
  281.     *latest,
  282.     *status;
  283.  
  284.   void
  285.     Tail();
  286.  
  287.   *flagp=WorstCase;
  288.   status=Node(MatchThisOrNext);
  289.   chain=NULL;
  290.   while ((*token != '\0') && (*token != '|') && (*token != ')'))
  291.   {
  292.     latest=Piece(&flags);
  293.     if (latest == NULL)
  294.       return(NULL);
  295.     *flagp|=flags & NonNull;
  296.     if (chain == NULL)
  297.       *flagp|=flags & SpecialStart;
  298.     else
  299.       Tail(chain,latest);
  300.     chain=latest;
  301.   }
  302.   if (chain == NULL)
  303.    (void) Node(MatchEmptyString);
  304.   return(status);
  305. }
  306.  
  307. /*
  308. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  309. %                                                                             %
  310. %                                                                             %
  311. %                                                                             %
  312. %   E m i t C o d e                                                           %
  313. %                                                                             %
  314. %                                                                             %
  315. %                                                                             %
  316. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  317. %
  318. %
  319. */
  320. static void EmitCode(opcode)
  321. char
  322.   opcode;
  323. {
  324.   if (code != &start_code)
  325.     *code++=opcode;
  326.   else
  327.     code_size++;
  328. }
  329.  
  330. /*
  331. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  332. %                                                                             %
  333. %                                                                             %
  334. %                                                                             %
  335. %   I n s e r t                                                               %
  336. %                                                                             %
  337. %                                                                             %
  338. %                                                                             %
  339. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  340. %
  341. %  Function Insert inserts an operator in front of an already-emitted operand.
  342. %
  343. */
  344. static void Insert(opcode,operand)
  345. char
  346.   opcode,
  347.   *operand;
  348. {
  349.   register char
  350.     *p,
  351.     *place,
  352.     *q;
  353.  
  354.   if (code == &start_code)
  355.     {
  356.       code_size+=3;
  357.       return;
  358.     }
  359.   p=code;
  360.   code+=3;
  361.   q=code;
  362.   while (p > operand)
  363.     *--q=(*--p);
  364.   place=operand;
  365.   *place++=opcode;
  366.   *place++='\0';
  367.   *place++='\0';
  368. }
  369.  
  370. /*
  371. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  372. %                                                                             %
  373. %                                                                             %
  374. %                                                                             %
  375. %   M a t c h                                                                 %
  376. %                                                                             %
  377. %                                                                             %
  378. %                                                                             %
  379. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  380. %
  381. %
  382. */
  383. static int Match(regular_expression)
  384. char
  385.   *regular_expression;
  386. {
  387.   char
  388.     *next_token,
  389.     *NextToken();
  390.  
  391.   register char
  392.     *scan;
  393.  
  394.   scan=regular_expression;
  395.   while (scan != NULL)
  396.   {
  397.     next_token=NextToken(scan);
  398.     switch(OpCode(scan))
  399.     {
  400.       case MatchBeginningOfLine:
  401.       {
  402.         if (p != start_pattern)
  403.           return(0);
  404.         break;
  405.       }
  406.       case MatchEndOfProgramOfLine:
  407.       {
  408.         if (*p != '\0')
  409.          return(0);
  410.         break;
  411.       }
  412.       case MatchAnyCharacter:
  413.       {
  414.         if (*p == '\0')
  415.           return(0);
  416.         p++;
  417.         break;
  418.       }
  419.       case MatchExactly:
  420.       {
  421.         register char
  422.           *operand;
  423.  
  424.         register int
  425.           length;
  426.  
  427.         operand=Operand(scan);
  428.         /*
  429.           Inline the first character for speed.
  430.         */
  431.         if (*operand != *p)
  432.           return(0);
  433.         length=strlen(operand);
  434.         if ((length > 1) && (strncmp(operand,p,length) != 0))
  435.           return(0);
  436.         p+=length;
  437.         break;
  438.       }
  439.       case MatchAnyCharacterOf:
  440.       {
  441.         if ((*p == '\0' || strchr(Operand(scan),*p) == NULL))
  442.           return(0);
  443.         p++;
  444.         break;
  445.       }
  446.       case MatchAnyCharacterBut:
  447.       {
  448.         if ((*p == '\0') || (strchr(Operand(scan),*p) != NULL))
  449.           return(0);
  450.         p++;
  451.         break;
  452.       }
  453.       case MatchEmptyString:
  454.         break;
  455.       case Back:
  456.         break;
  457.       case Open+1:
  458.       case Open+2:
  459.       case Open+3:
  460.       case Open+4:
  461.       case Open+5:
  462.       case Open+6:
  463.       case Open+7:
  464.       case Open+8:
  465.       case Open+9:
  466.       {
  467.         register char
  468.           *save;
  469.  
  470.         register int
  471.           no;
  472.  
  473.         no=OpCode(scan)-Open;
  474.         save=p;
  475.         if (!Match(next_token))
  476.           return(0);
  477.         else
  478.           {
  479.             /*
  480.               Don't set subpattern if some later invocation of the same
  481.               parentheses already has.
  482.             */
  483.             if (subpattern[no] == NULL)
  484.               subpattern[no]=save;
  485.             return(1);
  486.           }
  487.         break;
  488.       }
  489.       case Close+1:
  490.       case Close+2:
  491.       case Close+3:
  492.       case Close+4:
  493.       case Close+5:
  494.       case Close+6:
  495.       case Close+7:
  496.       case Close+8:
  497.       case Close+9:
  498.       {
  499.         register char
  500.           *save;
  501.  
  502.         register int
  503.           no;
  504.  
  505.         no=OpCode(scan)-Close;
  506.         save=p;
  507.         if (!Match(next_token))
  508.            return(0);
  509.         else
  510.           {
  511.             /*
  512.               Don't set subpattern_end if some later invocation of the same 
  513.               parentheses already has.
  514.             */
  515.             if (subpattern_end[no] == NULL)
  516.               subpattern_end[no]=save;
  517.             return(1);
  518.           }
  519.         break;
  520.       }
  521.       case MatchThisOrNext:
  522.       {
  523.         register char
  524.           *save;
  525.  
  526.         if (OpCode(next_token) != MatchThisOrNext)
  527.           next_token=Operand(scan);  
  528.         else
  529.           {
  530.             do
  531.             {
  532.               save=p;
  533.               if (Match(Operand(scan)))
  534.                 return(1);
  535.               p=save;
  536.               scan=NextToken(scan);
  537.             } while ((scan != NULL) && (OpCode(scan) == MatchThisOrNext));
  538.             return(0);
  539.           }
  540.         break;
  541.       }
  542.       case MatchZeroOrMore:
  543.       case MatchOneOrMore:
  544.       {
  545.         register char
  546.           next_tokench,
  547.           *save;
  548.  
  549.         register int
  550.           min,
  551.           no;
  552.  
  553.         /*
  554.           Lookahead to avoid useless match attempts when we know what
  555.           character comes next_token.
  556.         */
  557.         next_tokench='\0';
  558.         if (OpCode(next_token) == MatchExactly)
  559.           next_tokench=(*Operand(next_token));
  560.         min=(OpCode(scan) == MatchZeroOrMore) ? 0 : 1;
  561.         save=p;
  562.         no=Repeat(Operand(scan));
  563.         while (no >= min)
  564.         {
  565.           /*
  566.             If it could work, try it.
  567.           */
  568.           if ((next_tokench == '\0') || (*p == next_tokench))
  569.             if (Match(next_token))
  570.               return(1);
  571.           /*
  572.             Couldn't or didn't -- back up.
  573.           */
  574.           no--;
  575.           p=save+no;
  576.         }
  577.         return(0);
  578.         break;
  579.       }
  580.       case EndOfProgram:
  581.         return(1);
  582.         break;
  583.       default:
  584.         (void) fprintf(stderr,"Regular(3): %s","memory corruption");
  585.         return(0);
  586.         break;
  587.     }
  588.     scan=next_token;
  589.   }
  590.   (void) fprintf(stderr,"Regular(3): %s","corrupted pointers");
  591.   return(0);
  592. }
  593.  
  594. /*
  595. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  596. %                                                                             %
  597. %                                                                             %
  598. %                                                                             %
  599. %   N e x t T o k e n                                                         %
  600. %                                                                             %
  601. %                                                                             %
  602. %                                                                             %
  603. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  604. %
  605. %
  606. */
  607. static char *NextToken(p)
  608. register char
  609.   *p;
  610. {
  611.   register int
  612.     offset;
  613.  
  614.   if (p == &start_code)
  615.     return(NULL);
  616.   offset=Next(p);
  617.   if (offset == 0)
  618.     return(NULL);
  619.   if (OpCode(p) == Back)
  620.     return(p-offset);
  621.   else
  622.     return(p+offset);
  623. }
  624.  
  625. /*
  626. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  627. %                                                                             %
  628. %                                                                             %
  629. %                                                                             %
  630. %   N o d e                                                                   %
  631. %                                                                             %
  632. %                                                                             %
  633. %                                                                             %
  634. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  635. %
  636. %
  637. */
  638. static char *Node(opcode)
  639. char
  640.   opcode;
  641. {
  642.   register char
  643.     *ptr,
  644.     *status;
  645.  
  646.   status=code;
  647.   if (status == &start_code)
  648.     {
  649.       code_size+=3;
  650.       return(status);
  651.     }
  652.   ptr=status;
  653.   *ptr++=opcode;
  654.   *ptr++='\0';
  655.   *ptr++='\0';
  656.   code=ptr;
  657.   return(status);
  658. }
  659.  
  660. /*
  661. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  662. %                                                                             %
  663. %                                                                             %
  664. %                                                                             %
  665. %   O p T a i l                                                               %
  666. %                                                                             %
  667. %                                                                             %
  668. %                                                                             %
  669. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  670. %
  671. %
  672. */
  673. static void OpTail(p,value)
  674. char
  675.   *p;
  676.  
  677. char
  678.   *value;
  679. {
  680.   void
  681.     Tail();
  682.  
  683.   /*
  684.     "Operandless" and "op != MatchThisOrNext" are synonymous in practice.
  685.   */
  686.   if ((p == NULL) || (p == &start_code) || (OpCode(p) != MatchThisOrNext))
  687.     return;
  688.   Tail(Operand(p),value);
  689. }
  690.  
  691. /*
  692. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  693. %                                                                             %
  694. %                                                                             %
  695. %                                                                             %
  696. %   P i e c e                                                                 %
  697. %                                                                             %
  698. %                                                                             %
  699. %                                                                             %
  700. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  701. %
  702. %
  703. */
  704. static char *Piece(flagp)
  705. int
  706.   *flagp;
  707. {
  708.   int
  709.     flags;
  710.  
  711.   register char
  712.     *next_token,
  713.     op,
  714.     *status;
  715.  
  716.   void
  717.     Tail();
  718.  
  719.   status=Atom(&flags);
  720.   if (status == NULL)
  721.     return(NULL);
  722.   op=(*token);
  723.   if (!MultipleMatches(op))
  724.     {
  725.       *flagp=flags;
  726.       return(status);
  727.     }
  728.   if (!(flags & NonNull) && op != '?')
  729.     Fail("*+ operand could be empty");
  730.   *flagp=(op != '+') ? (WorstCase | SpecialStart) : (WorstCase | NonNull);
  731.   if (op == '*' && (flags & Simple))
  732.     Insert(MatchZeroOrMore,status);
  733.   else
  734.     if (op == '*')
  735.       {
  736.         /*
  737.           Emit x* as (x&|), where & means "self".
  738.         */
  739.         Insert(MatchThisOrNext,status);  
  740.         OpTail(status,Node(Back)); 
  741.         OpTail(status,status); 
  742.         Tail(status,Node(MatchThisOrNext));  
  743.         Tail(status,Node(MatchEmptyString));
  744.       }
  745.     else
  746.       if ((op == '+') && (flags & Simple))
  747.         Insert(MatchOneOrMore,status);
  748.       else
  749.         if (op == '+')
  750.           {
  751.             /*
  752.               Emit x+ as x (&|), where & means "self".
  753.             */
  754.             next_token=Node(MatchThisOrNext);  
  755.             Tail(status,next_token);
  756.             Tail(Node(Back),status);
  757.             Tail(next_token,Node(MatchThisOrNext));
  758.             Tail(status,Node(MatchEmptyString));
  759.           }
  760.         else
  761.           if (op == '?')
  762.             {
  763.               /*
  764.                 Emit x? as (x|)
  765.               */
  766.               Insert(MatchThisOrNext,status);  
  767.               Tail(status,Node(MatchThisOrNext));  
  768.               next_token=Node(MatchEmptyString);  
  769.               Tail(status,next_token);
  770.               OpTail(status,next_token);
  771.             }
  772.   token++;
  773.   if (MultipleMatches(*token))
  774.     Fail("nested *?+");
  775.   return(status);
  776. }
  777.  
  778. /*
  779. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  780. %                                                                             %
  781. %                                                                             %
  782. %                                                                             %
  783. %   R e g u l a r                                                             %
  784. %                                                                             %
  785. %                                                                             %
  786. %                                                                             %
  787. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  788. %
  789. %
  790. */
  791. static char *Regular(parenthesized,flagp)
  792. int
  793.   parenthesized;
  794.  
  795. int
  796.   *flagp;
  797. {
  798.   int
  799.     flags;
  800.  
  801.   register char
  802.     *br,
  803.     *ender,
  804.     *status;
  805.  
  806.   register int
  807.     count;
  808.  
  809.   void
  810.     Tail();
  811.  
  812.   count=0;
  813.   *flagp=NonNull;
  814.   if (!parenthesized)
  815.     status=NULL;
  816.   else
  817.     {
  818.       /*
  819.         Make an Open node.
  820.       */
  821.       if (number_parenthesis >= NumberSubExpressions)
  822.         Fail("too many ()");
  823.       count=number_parenthesis;
  824.       number_parenthesis++;
  825.       status=Node(Open+count);
  826.     }
  827.   /*
  828.     Pick up the branches, linking them together.
  829.   */
  830.   br=Branch(&flags);
  831.   if (br == NULL)
  832.     return(NULL);
  833.   if (status != NULL)
  834.     Tail(status,br);
  835.   else
  836.     status=br;
  837.   if (!(flags & NonNull))
  838.     *flagp&=(~NonNull);
  839.   *flagp|=flags & SpecialStart;
  840.   while (*token == '|')
  841.   {
  842.     token++;
  843.     br=Branch(&flags);
  844.     if (br == NULL)
  845.       return(NULL);
  846.     Tail(status,br);
  847.     if (!(flags & NonNull))
  848.       *flagp &= ~NonNull;
  849.     *flagp|=flags & SpecialStart;
  850.   }
  851.   /*
  852.     Make a closing node and hook it on the end.
  853.   */
  854.   ender=Node((parenthesized) ? Close+count : EndOfProgram);
  855.   Tail(status,ender);
  856.   /*
  857.     Hook the tails of the branches to the closing node.
  858.   */
  859.   for(br=status; br != NULL; br=NextToken(br))
  860.     OpTail(br,ender);
  861.   /*
  862.     Check for proper termination.
  863.   */
  864.   if (parenthesized && (*token++ != ')'))
  865.     Fail("unmatched()")
  866.   else
  867.     if (!parenthesized && (*token != '\0'))
  868.       {
  869.         if (*token == ')')
  870.           Fail("unmatched()")
  871.         else
  872.           Fail("junk on end")
  873.        }
  874.   return(status);
  875. }
  876.  
  877. /*
  878. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  879. %                                                                             %
  880. %                                                                             %
  881. %                                                                             %
  882. %   R e p e a t                                                               %
  883. %                                                                             %
  884. %                                                                             %
  885. %                                                                             %
  886. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  887. %
  888. %
  889. */
  890. static int Repeat(p)
  891. char
  892.   *p;
  893. {
  894.   register char
  895.     *operand,
  896.     *scan;
  897.  
  898.   register int
  899.     count=0;
  900.  
  901.   scan=p;
  902.   operand=Operand(p);
  903.   switch(OpCode(p))
  904.   {
  905.     case MatchAnyCharacter:
  906.     {
  907.       count=strlen(scan);
  908.       scan+=count;
  909.       break;
  910.     }
  911.     case MatchExactly:
  912.     {
  913.       while (*operand == *scan)
  914.       {
  915.         count++;
  916.         scan++;
  917.       }
  918.       break;
  919.     }
  920.     case MatchAnyCharacterOf:
  921.     {
  922.       while ((*scan != '\0') && (strchr(operand,*scan) != NULL))
  923.       {
  924.         count++;
  925.         scan++;
  926.       }
  927.       break;
  928.     }
  929.     case MatchAnyCharacterBut:
  930.     {
  931.       while ((*scan != '\0') && (strchr(operand,*scan) == NULL))
  932.       {
  933.         count++;
  934.         scan++;
  935.       }
  936.       break;
  937.     }
  938.     default:
  939.     {
  940.       (void) fprintf(stderr,"Regular(3): %s","internal foulup");
  941.       count=0;
  942.       break;
  943.     }
  944.   }
  945.   p=scan;
  946.   return(count);
  947. }
  948.  
  949. /*
  950. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  951. %                                                                             %
  952. %                                                                             %
  953. %                                                                             %
  954. %   T a i l                                                                   %
  955. %                                                                             %
  956. %                                                                             %
  957. %                                                                             %
  958. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  959. %
  960. %
  961. */
  962. static void Tail(p,val)
  963. char
  964.   *p;
  965.  
  966. char
  967.   *val;
  968. {
  969.   register char
  970.     *scan,
  971.     *temp;
  972.  
  973.   register int
  974.     offset;
  975.  
  976.   if (p == &start_code)
  977.     return;
  978.   /*
  979.     Find last node.
  980.   */
  981.   scan=p;
  982.   for(;;)
  983.   {
  984.     temp=NextToken(scan);
  985.     if (temp == NULL)
  986.       break;
  987.     scan=temp;
  988.   }
  989.   if (OpCode(scan) == Back)
  990.     offset=scan-val;
  991.   else
  992.     offset=val-scan;
  993.   *(scan+1)=(offset >> 8) & 0377;
  994.   *(scan+2)=offset & 0377;
  995. }
  996.  
  997. /*
  998. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  999. %                                                                             %
  1000. %                                                                             %
  1001. %                                                                             %
  1002. %   T r y                                                                     %
  1003. %                                                                             %
  1004. %                                                                             %
  1005. %                                                                             %
  1006. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  1007. %
  1008. %
  1009. */
  1010. static int Try(regular_expression,pattern)
  1011. RegularExpression
  1012.   *regular_expression;
  1013.  
  1014. char
  1015.   *pattern;
  1016. {
  1017.   register char
  1018.     **ep,
  1019.     **sp;
  1020.  
  1021.   register int
  1022.     i;
  1023.  
  1024.   p=pattern;
  1025.   subpattern=regular_expression->subpattern;
  1026.   subpattern_end=regular_expression->subpattern_end;
  1027.   sp=regular_expression->subpattern;
  1028.   ep=regular_expression->subpattern_end;
  1029.   for(i=NumberSubExpressions; i > 0; i--)
  1030.   {
  1031.     *sp++=NULL;
  1032.     *ep++=NULL;
  1033.   }
  1034.   if (!Match(regular_expression->program+1))
  1035.     return(0);
  1036.   else
  1037.     {
  1038.       regular_expression->subpattern[0]=pattern;
  1039.       regular_expression->subpattern_end[0]=p;
  1040.       return(1);
  1041.     }
  1042. }
  1043.  
  1044. /*
  1045. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  1046. %                                                                             %
  1047. %                                                                             %
  1048. %                                                                             %
  1049. %   C o m p i l e R e g u l a r E x p r e s s i o n                           %
  1050. %                                                                             %
  1051. %                                                                             %
  1052. %                                                                             %
  1053. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  1054. %
  1055. %  Function CompileRegularExpression compiles a regular expression into a 
  1056. %  structure of type RegularExpression and returns a pointer to it.  The space 
  1057. %  is allocated using function malloc and may be released by function free.
  1058. %
  1059. %
  1060. */
  1061. RegularExpression *CompileRegularExpression(regular_expression)
  1062. char
  1063.   *regular_expression;
  1064. {
  1065.   int
  1066.     flags;
  1067.  
  1068.   register char
  1069.     *longest,
  1070.     *scan;
  1071.  
  1072.   register RegularExpression
  1073.     *r;
  1074.  
  1075.   register int
  1076.     length;
  1077.  
  1078.   if (regular_expression == NULL)
  1079.     Fail("NULL argument");
  1080.   /*
  1081.     First pass: determine size.
  1082.   */
  1083.   token=regular_expression;
  1084.   number_parenthesis=1;
  1085.   code_size=0L;
  1086.   code=(&start_code);
  1087.   EmitCode(Magick);
  1088.   if (Regular(0,&flags) == NULL)
  1089.     return(NULL);
  1090.   /*
  1091.     Allocate space.
  1092.   */
  1093.   r=(RegularExpression *)
  1094.     malloc((unsigned int) (code_size+sizeof(RegularExpression)));
  1095.   if (r == NULL)
  1096.     Fail("out of space");
  1097.   /*
  1098.     Second pass: emit code.
  1099.   */
  1100.   token=regular_expression;
  1101.   number_parenthesis=1;
  1102.   code=r->program;
  1103.   EmitCode(Magick);
  1104.   if (Regular(0,&flags) == NULL)
  1105.     return(NULL);
  1106.   /*
  1107.     Dig out information for optimizations.
  1108.   */
  1109.   r->start_character='\0';
  1110.   r->anchor=0;
  1111.   r->priority_pattern=NULL;
  1112.   r->pattern_length=0;
  1113.   scan=r->program+1;
  1114.   if (OpCode(NextToken(scan)) == EndOfProgram)
  1115.     {
  1116.       scan=Operand(scan);
  1117.       if (OpCode(scan) == MatchExactly)
  1118.         r->start_character=(*Operand(scan));
  1119.       else
  1120.         if (OpCode(scan) == MatchBeginningOfLine)
  1121.           r->anchor++;
  1122.       /*
  1123.         If there's something expensive in the regular expression, find the
  1124.         longest literal pattern that must appear and make it the 
  1125.         priority_pattern.
  1126.       */
  1127.       if (flags & SpecialStart)
  1128.         {
  1129.           longest=NULL;
  1130.           length=0;
  1131.           for(; scan != NULL; scan=NextToken(scan))
  1132.             if ((OpCode(scan) == MatchExactly) && 
  1133.                 (strlen(Operand(scan)) >= length))
  1134.               {
  1135.                 longest=Operand(scan);
  1136.                 length=strlen(Operand(scan));
  1137.               }
  1138.           r->priority_pattern=longest;
  1139.           r->pattern_length=length;
  1140.         }
  1141.     }
  1142.   return(r);
  1143. }
  1144.  
  1145. /*
  1146. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  1147. %                                                                             %
  1148. %                                                                             %
  1149. %                                                                             %
  1150. %   E x e c u t e R e g u l a r E x p r e s s i o n                           %
  1151. %                                                                             %
  1152. %                                                                             %
  1153. %                                                                             %
  1154. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  1155. %
  1156. %  Function ExecuteRegularExpression matches a NULL-terminated pattern against 
  1157. %  the compiled regular expression in regular-expression.  It returns 1 for 
  1158. %  success and 0 for failure.
  1159. %
  1160. %
  1161. */
  1162. int ExecuteRegularExpression(regular_expression,pattern)
  1163. register RegularExpression
  1164.   *regular_expression;
  1165.  
  1166. register char
  1167.   *pattern;
  1168. {
  1169.   register char
  1170.     *s;
  1171.  
  1172.   if ((regular_expression == (RegularExpression *) NULL) || 
  1173.       (pattern == (char *) NULL))
  1174.     {
  1175.       (void) fprintf(stderr,"Regular(3): %s","NULL parameter\n");
  1176.       return(0);
  1177.     }
  1178.   /*
  1179.     Check validity of program.
  1180.   */
  1181.   if (((int)*(unsigned char *)(regular_expression->program)) != Magick)
  1182.     {
  1183.       (void) fprintf(stderr,"Regular(3): %s","corrupted program");
  1184.       return(0);
  1185.     }
  1186.   /*
  1187.     If there is a "must appear" pattern, look for it.
  1188.   */
  1189.   if (regular_expression->priority_pattern != NULL)
  1190.     {
  1191.       s=pattern;
  1192.       while ((s=strchr(s,regular_expression->priority_pattern[0])) != NULL)
  1193.       {
  1194.         if (strncmp(s,regular_expression->priority_pattern,
  1195.             regular_expression->pattern_length) == 0)
  1196.           break;
  1197.         s++;
  1198.        }
  1199.        if (s == NULL)
  1200.          return(0);
  1201.     }
  1202.   /*
  1203.     Mark beginning of line for ^.
  1204.   */
  1205.   start_pattern=pattern;
  1206.   /*
  1207.     Simplest case:  anchored match need be tried only once.
  1208.   */
  1209.   if (regular_expression->anchor)
  1210.     return(Try(regular_expression,pattern));
  1211.   /*
  1212.     Messy cases:  unanchored match.
  1213.   */
  1214.   s=pattern;
  1215.   if (regular_expression->start_character != '\0')
  1216.     while ((s=strchr(s,regular_expression->start_character)) != NULL)
  1217.     {
  1218.       if (Try(regular_expression,s))
  1219.         return(1);
  1220.       s++;
  1221.     }
  1222.   else
  1223.     do
  1224.     {
  1225.       if (Try(regular_expression,s))
  1226.         return(1);
  1227.     } while (*s++ != '\0');
  1228.   return(0);
  1229. }
  1230.