home *** CD-ROM | disk | FTP | other *** search
- (* :Title: String Patterns *)
-
- (* :Context: Examples`StringPatterns` *)
-
- (* :Author: John M. Novak *)
-
- (* :Summary: Implements a method of more general pattern
- matching with strings than the built-in * and @.
- *)
-
- (* :Package Version: 2.0 *)
-
- (* :Mathematica Version: 2.0 *)
-
- (* :History:
- V 2.0, John M. Novak, July 1991 : switch to Mathematica-like
- patterns.
- V 1.0, John M. Novak, June 1991
- *)
-
- (* :Keywords:
- pattern matching, strings
- *)
-
- (* :Warning:
- Adds rules to StringMatchQ and StringPosition.
- *)
-
- (* :Limitation:
- Certain complex patterns take considerably longer to
- evaluate than one might expect...
- *)
-
- (* :Limitation:
- Many instances of variables of the form $nnn may be left
- behind if any of the special patterns (as described below)
- are used.
- *)
-
- (* :Discussion:
- This package adds the capability of Mathematica pattern
- matching for strings. It allows the use of standard Mathematica
- patterns connected by StringJoin to be matched to a string,
- using the functions StringMatchQ and StringPosition.
- The patterns treat a string as a sequence of characters, in
- direct correlation to sequences in normal Mathematica expressions.
- Thus, we can have the pattern:
- "a"<>___<>"b"
- which matches to a string that starts with a, ends with b, and
- has zero or more characters in between. All standard Mathematica
- patterns are allowed.
- There are also several enhancements, dealing with matching to
- a sequence of characters, for instance:
- "a"<>("bc"..)<>"d"
- matches any string beginning with a, ending with d, and with
- one or more occurences of the substring "bc" in between.
- Note that this diverges from standard Mathematica patterns, as
- (b,c).. is not normally a valid pattern. The patterns with
- this enhancement are Repeated, RepeatedNull, and Alternatives.
-
- The implementation is simple enough. The basic match is
- performed by separating the target string into a list of
- characters, and modifying the pattern to be a list of
- characters and patterns. The problems come with the
- new pattern cases - replacement rules are used to replace
- the new patterns with valid old patterns that do the
- same thing - unfortunately, the old patterns are often
- rather complex, involving extensive use of Condition[] and
- unique new variables. This can cause slow evaluation in
- some cases.
- *)
-
- (* We need to make sure that the usage messages for StringMatchQ
- and StringPosition are loaded before we attempt to modify
- them. *)
-
- If[!ValueQ[MessageName[StringMatchQ,"usage"]],
- $NewMessage[StringMatchQ,"usage"]
- ];
-
- If[!ValueQ[MessageName[StringPosition,"usage"]],
- $NewMessage[StringPosition,"usage"]
- ];
-
- BeginPackage["Examples`StringPatterns`"]
-
- StringMatchQ::usage = MessageName[StringMatchQ,"usage"] <>
- "\nStringMatchQ[string,pattern] determines whether or not
- the string matches the pattern, where the pattern is as
- described in the discussion section of the StringPattern
- package."
-
- StringPosition::usage = MessageName[StringPosition,"usage"] <>
- "\nStringPosition[string,pattern] finds occurences of the
- pattern in the string, where the pattern is as described in
- the discussion section of the StringPattern package. In this
- usage, an additional option, AnchorMatch, is allowed."
-
- AnchorMatch::usage =
- "An option for StringPosition, used only with a string
- pattern. Determines whether a pattern should be anchored
- to the head or tail of the string. Accepts the values of
- AnchorHead, AnchorTail, or False."
-
- AnchorHead::usage =
- "A value for the option AnchorMatch. Determines that
- the pattern will be anchored to the beginning of the
- string.";
-
- AnchorTail::usage =
- "A value for the option AnchorMatch. Determines that
- the pattern will be anchored to the end of the
- string.";
-
- CharacterRange::usage =
- "CharacterRange[char1,char2] generates a pattern for use
- with StringMatchQ and StringPosition. The pattern matches
- any character between char1 and char2, as determined by
- $StringOrder. Note that this is not case sensitive."
-
- Begin["`Private`"]
-
- Unprotect[StringMatchQ,StringPosition];
-
- StringMatchQ[str_String,pat_?PatternHeadQ,opts___] :=
- StringMatchQ[str,StringJoin[pat],opts]
-
- StringMatchQ[stringi_String,
- StringJoin[pati___],opts___] :=
- Module[{ic,string,pat},
- {ic} = {IgnoreCase}/.{opts}/.Options[StringMatchQ];
- If[TrueQ[ic],
- string = ToLowerCase[stringi];
- pat = pati/.s_String :> ToLowerCase[s],
- string = stringi;
- pat = pati
- ];
- MatchQ[Characters[string],
- Map[If[Head[#] === String,
- Sequence @@ Characters[#],
- ReleaseHold[#]]&,
- {pat}/.
- {Literal[Repeated][s_String]:>
- repstrrule[Unique[],s],
- Literal[RepeatedNull][s_String]:>
- nrepstrrule[Unique[],s],
- Literal[Repeated][Literal[Alternatives][
- p1___,p2_String?(StringLength[#] > 1&),
- p3___]] :>
- repaltstrrule[Unique[],p1,p2,p3],
- Literal[RepeatedNull][Literal[Alternatives][
- p1___,p2_String?(StringLength[#] > 1&),
- p3___]] :>
- nrepaltstrrule[Unique[],p1,p2,p3],
- Literal[Alternatives][f___,
- s_String?(StringLength[#] > 1 &),
- l___] :>
- altstrrule[Unique[],{f,s,l}]
- }
- ]
- ]
- ]
-
- StringPosition::anchor =
- "Warning: the option AnchorMatch is valid only if using
- a pattern of the sort defined in the StringPatterns
- package!";
-
- Options[StringPosition] =
- Append[Options[StringPosition],AnchorMatch->False];
-
- StringPosition[str_String,pat_?PatternHeadQ,opts___] :=
- StringPosition[str,StringJoin[pat],opts]
-
- StringPosition[istr_String,ipat_StringJoin,opts___] :=
- Module[{ic,ov,am,str,pat},
- {ic,ov,am} = {IgnoreCase,Overlaps,AnchorMatch}/.
- {opts}/.Options[StringPosition];
- If[TrueQ[ic],
- str = ToLowerCase[istr];
- pat = ipat/.s_String :> ToLowerCase[s],
- str = istr;
- pat = ipat
- ];
- findsegs[am,TrueQ[ov],str,pat]
- ]
-
- StringPosition[arg1___,AnchorMatch -> _,arg2___] :=
- (Message[StringPosition::anchor];
- StringPosition[arg1,arg2])
-
- Protect[StringMatchQ,StringPosition];
-
- (* utility function that returns True if the Head of the expr
- is a known pattern, False otherwise. *)
-
- PatternHeadQ[expr_] :=
- MemberQ[{Repeated,
- RepeatedNull,
- Alternatives,
- Pattern,
- Blank,
- BlankNullSequence,
- BlankSequence,
- Condition,
- PatternTest},
- Head[expr]
- ]
-
- (* utility function that returns True if comp is composed of
- str repeated, False otherwise. *)
-
- RepeatedStringQ[str_String,comp___] :=
- With[{ln = Length[{comp}]/StringLength[str]},
- StringJoin[Table[str,{ln}]] ===
- StringJoin[comp]/;Head[ln] === Integer
- ]
-
- RepeatedStringQ[___] := False
-
- (* similar to above, except if comp is empty (Null) then
- the pattern matches as well. *)
-
- RepeatedNullStringQ[str_,comp___] :=
- TrueQ[{comp} === {} || RepeatedStringQ[str,comp]]
-
- (* Note that RulDelayed::rhs needs to be shut off for a bit -
- Many of the following rules define patterns, which confuses
- SetDelayed, generating a warning. The warning can be safely
- ignored here, so I just shut it off temporarily. *)
-
- Off[RuleDelayed::rhs]
-
- (* replacement patterns for new pattern types... *)
- (* repeated string *)
-
- repstrrule[var_,str_] :=
- Hold[var__/;RepeatedStringQ[str,var]]
-
- (* repeatednull string *)
-
- nrepstrrule[var_,str_] :=
- Hold[var___/;RepeatedNullStringQ[str,var]]
-
- (* repeated with string alternatives *)
-
- repaltstrrule[var_,pats___] :=
- Hold[var___/;rpats[{var},pats]]
-
- (* repeatednull with string alternatives *)
-
- nrepaltstrrule[var_,pats___] :=
- Hold[var___/;apats[{var},pats]]
-
- (* string alternatives - splits up into string and pattern
- alternates, then calls alts sub-rule *)
-
- altstrrule[var_,args_] :=
- Module[{sargs,oargs},
- sargs = Select[args,(Head[#] === String &&
- StringLength[#] > 1)&];
- oargs = Complement[args,sargs];
- alts[var,sargs,oargs]
- ]
-
- (* sub-rule for alternatives - does the match *)
-
- alts[var_,sargs_,{oargs___}] :=
- Alternatives[var___/;MemberQ[sargs,StringJoin[var]],
- oargs]
-
- (* sub-rule for nrepaltstrrule, splits up strings and other
- patterns, then maps an individual comparison across each
- piece of the input *)
-
- apats[input_,pats___] :=
- Module[{spats,opats},
- spats = Select[{pats},Head[#] === String &];
- opats = Complement[{pats},spats];
- MatchQ[input,Map[
- ReleaseHold[ipat[Unique[],spats,opats]]&,input]]
- ]
-
- (* sub-rule for repaltstrrule, much as apats, but uses slightly
- different test for first element. *)
-
- rpats[input_,pats___] :=
- Module[{pat,spats,opats},
- spats = Select[{pats},Head[#] === String &];
- opats = Complement[{pats},spats];
- MatchQ[input,
- Prepend[Map[
- ReleaseHold[ipat[Unique[],spats,opats]]&,input],
- ReleaseHold[inpat[Unique[],spats,opats]]]
- ]
- ]
-
- (* sub-rule for apats, does individual comparison *)
-
- ipat[var_,spats_,opats_] :=
- Hold[var___/;Or[MemberQ[Append[spats,""],StringJoin[var]],
- MatchQ[{var},{Alternatives @@ opats}]]]
-
- inpat[var_,spats_,opats_] :=
- Hold[var___/;Or[MemberQ[spats,StringJoin[var]],
- MatchQ[{var},{Alternatives @@ opats}]]]
-
- (* Pattern for range of characters *)
-
- CharacterRange[c1_String,c2_String] :=
- _?(OrderedQ[{c1,#,c2}]&)
-
- On[RuleDelayed::rhs]
-
- (* Handles the various combinations of an anchored match and
- overlaps for StringPosition. Usually does something like
- generating a table of substring positions, then going
- through and extracting them and comparing each to the
- pattern. Returns the list of sub-string positions where
- the match is good. (within the constraints of the options.) *)
-
- findsegs[AnchorHead,True,str_,pat_] :=
- Module[{n,m,ln = StringLength[str]},
- Select[Table[{1,n},{n,ln}],
- StringMatchQ[StringTake[str,#],pat]&
- ]
- ]
-
- findsegs[AnchorHead,False,str_,pat_] :=
- Module[{n,m,segs,ln = StringLength[str]},
- segs = Table[{1,n},{n,ln}];
- n = 1;
- While[!StringMatchQ[StringTake[str,segs[[n]]],
- pat] && n <= Length[segs],
- n += 1];
- If[n <= Length[segs],{segs[[n]]},{}]
- ]
-
- findsegs[AnchorTail,True,str_,pat_] :=
- Module[{n,m,ln = StringLength[str]},
- Select[Table[{n,ln},{n,ln,1,-1}],
- StringMatchQ[StringTake[str,#],pat]&]
- ]
-
- findsegs[AnchorTail,False,str_,pat_] :=
- Module[{n,m,ln = StringLength[str],segs},
- segs = Table[{n,ln},{n,ln,1,-1}];
- n = 1;
- While[!StringMatchQ[StringTake[str,segs[[n]] ],
- pat] && n <= Length[segs],
- n += 1];
- If[n <= Length[segs],{segs[[n]]},{}]
- ]
-
- findsegs[_,True,str_,pat_] :=
- Module[{n,m,ln = StringLength[str]},
- Select[Flatten[Table[{n,m},{n,ln},{m,n,ln}],1],
- StringMatchQ[StringTake[str,#],pat]&]
- ]
-
- findsegs[_,False,str_,pat_] :=
- Module[{n,m,ln = StringLength[str],out = {}},
- m = 1;
- While[m <= ln,
- n = m;
- While[n <= ln && !StringMatchQ[
- StringTake[str,{m,n}],pat],
- n += 1];
- If[n <= ln,
- AppendTo[out,{m,n}];m = n,
- m += 1]
- ];
- out
- ]
-
- findsegs[___] := {}
-
- End[]
-
- EndPackage[]
-