home *** CD-ROM | disk | FTP | other *** search
- { BOYERMO1.PAS ( 22 January 1988) (Rufus S. Hendon) }
-
- { This unit provides facilities for searching a text for a target using
- the Boyer-Moore search method. The routine is based on Don Strenczewilk's
- implementation of a variant form of the Boyer-Moore method (his case-
- insensitive version B1, available on CompuServe in file BLINE.ARC in
- Borland BPROGA Data Library 4, uploaded 21 August 1987). In addition to
- repackaging his routine as a Turbo Pascal 4.0 unit, I have modified it
- (1) to provide protection against endless loops that in the original
- version can arise due to wrap-around of the index used to scan the text
- when the length of the text approaches the maximum (65521 characters)
- allowed by Turbo Pascal 4.0 for an array of type CHAR and (2) to improve
- efficiency slightly by removing three instructions (a PUSH, a MOV, and a
- POP) from the comparison loop.
- The text to be searched must be stored in an array of type CHAR or an
- equivalent user-defined type. The lower bound of the array must be 1.
- The target for which the text is to be searched must be of type STRING.
- Whenever the text is to be searched for a new target, the program must
- call MAKE_BOYER_MOORE_TABLE to have the shift table required by the Boyer-
- Moore method created and to inform the unit of the target to be used when
- the text is searched. The function BOYER_MOORE_SEARCH searches the text
- for the target specified in the most recent call to MAKE_BOYER_MOORE_TABLE,
- beginning the search at a position in the text specified by the argument
- associated with the parameter START. To search the entire text, the
- function would be invoked with START = 1. The function scans the text
- beginning from the START position for the first substring that matches the
- target. If such a substring is found, the function returns the position
- (array subscript) of the initial character of the matching substring;
- since the array is required to have 1 as its lower bound, the position
- returned after a successful search will always be greater than 0. If the
- function fails to find a matching substring, it returns 0.
- When it is required that all occurrences of the target in the text be
- found, BOYER_MOORE_SEARCH would be invoked in a loop, in which the START
- argument would initially have the value of 1; thereafter, after every
- successful search, the START argument would be reset to the position
- returned by the function plus 1. The loop would terminate when the
- function reported failure. The loop would have a general structure
- similar to this:
-
- item := [the target string];
- make_Boyer_Moore_table(item);
- scan_beginning := 1;
- search_text_length := length(search_text);
- repeat
- i := Boyer_Moore_search(search_text,scan_beginning,search_text_length);
- if i > 0 then begin
- [do whatever processing is required when the search is successful];
- scan_beginning := i+1
- end
- until i = 0
-
- Note that if the text array can only be referred to by means of a
- pointer, as will be the case if the array is allocated in the heap by
- means of the NEW procedure, the pointer, when used as the first argument
- of BOYER_MOORE_SEARCH, must be dereferenced by writing '^' after it. If,
- for example, TEXTPTR is a pointer to the text array, the call to the
- search function in the loop just given would take this form:
-
- i := Boyer_Moore_search(textptr^,scan_beginning,search_text_length);
- }
- {============================================================================}
- unit BOYERMO1;
- {============================================================================}
- interface
-
- procedure MAKE_BOYER_MOORE_TABLE(var target: string);
- { TARGET is the target string for which a text is to be searched. A shift
- table for the target string is constructed in private storage in the unit
- and a copy of the target string is also stored inside the unit for
- subsequent use by BOYER_MOORE_SEARCH. }
-
- function BOYER_MOORE_SEARCH(var text_array; start, text_length: word):
- word;
- { TEXT_ARRAY is an array of characters in which a text is stored; the
- text begins in TEXT_ARRAY[1] and is TEXT_LENGTH characters long. A
- Boyer-Moore search is performed on the text in TEXT_ARRAY, beginning with
- the character in position START, for the first substring that matches
- the target string specified in the most recent invocation of
- MAKE_BOYER_MOORE_TABLE. If a match is found, the position of the first
- character of the matching substring is returned. Otherwise 0 is
- returned. }
- {============================================================================}
- implementation
-
- const
- copy: string = ''; { initialized to an empty string to protect against }
- { the invocation of BOYER_MOORE_SEARCH without a }
- { prior call to MAKE_BOYER_MOORE_TABLE }
- var
- table: array[char] of byte;
- {****************************************************************************}
- procedure MAKE_BOYER_MOORE_TABLE(var target: string);
- { TARGET is the target string for which a text is to be searched. A shift
- table for the target string is constructed in private storage in the unit
- and a copy of the target string is also stored inside the unit for
- subsequent use by BOYER_MOORE_SEARCH. }
- begin { MAKE_BOYER_MOORE_TABLE }
- inline
- ($1E/ { push ds }
- $1E/ { push ds }
- $07/ { pop es }
- $BF/>copy/ { mov di,offset copy }
- $C5/$76/<target/ { lds si,[bp].target }
- $8B/$DE/ { mov bx,si }
- $32/$E4/ { xor ah,ah }
- $8A/$04/ { mov al,[si] }
- $26/$88/$05/ { mov es:[di],al }
- $FC/ { cld }
- $0A/$C0/ { or al,al }
- $74/$06/ { jz tbl }
- $46/ { inc si }
- $47/ { inc di }
- $8B/$C8/ { mov cx,ax }
- $F3/$A4/ { rep movsb }
- $BF/>table/ { tbl: mov di,offset table }
- $8B/$F3/ { mov si,bx }
- $8B/$D7/ { mov dx,di }
- $8A/$E0/ { mov ah,al }
- $B9/>$0080/ { mov cx,80h }
- $F3/$AB/ { rep stosw }
- $8B/$F3/ { mov si,bx }
- $8B/$FA/ { mov di,dx }
- $46/ { inc si }
- $98/ { cbw }
- $3C/$01/ { cmp al,1 }
- $7E/$13/ { jle done }
- $48/ { dec ax }
- $8A/$CC/ { mov cl,ah }
- $8A/$FC/ { mov bh,ah }
- $8A/$1C/ { next: mov bl,[si] }
- $8B/$D0/ { mov dx,ax }
- $2B/$D1/ { sub dx,cx }
- $88/$11/ { mov [bx+di],dl }
- $46/ { inc si }
- $41/ { inc cx }
- $3B/$C8/ { cmp cx,ax }
- $75/$F2/ { jne next }
- $1F) { done: pop ds }
- end; { MAKE_BOYER_MOORE_TABLE }
- {****************************************************************************}
- function BOYER_MOORE_SEARCH(var text_array; start, text_length: word): word;
- { TEXT_ARRAY is an array of characters in which a text is stored; the
- text begins in TEXT_ARRAY[1] and is TEXT_LENGTH characters long. A
- Boyer-Moore search is performed on the text in TEXT_ARRAY, beginning with
- the character in position START, for the first substring that matches
- the target string specified in the most recent invocation of
- MAKE_BOYER_MOORE_TABLE. If a match is found, the position of the first
- character of the matching substring is returned. Otherwise 0 is
- returned. }
- begin { BOYER_MOORE_SEARCH }
- inline
- ($1E/ { push ds }
- $33/$C0/ { xor ax,ax }
- $8B/$D0/ { mov dx,ax }
- $BE/>copy/ { mov si,offset copy }
- $8A/$14/ { mov dl,[si] }
- $80/$FA/$01/ { cmp dl,1 }
- $7F/$1F/ { jg boyer }
- $7C/$6E/ { jl notfound2 }
- $8A/$44/$01/ { mov al,[si+1] }
- $8B/$56/<start/ { mov dx,[bp+start] }
- $4A/ { dec dx }
- $8B/$4E/<text_length/ { mov cx,[bp+text_length] }
- $2B/$CA/ { sub cx,dx }
- $C4/$7E/<text_array/ { les di,[bp+text_array] }
- $8B/$DF/ { mov bx,di }
- $03/$FA/ { add di,dx }
- $FC/ { cld }
- $F2/$AE/ { repne scasb }
- $75/$56/ { jne notfound2 }
- $97/ { xchg ax,di }
- $2B/$C3/ { sub ax,bx }
- $EB/$53/ { jmp short exit }
- $FE/$CA/ { boyer: dec dl }
- $03/$F2/ { add si,dx }
- $C4/$7E/<text_array/ { les di,[bp+text_array] }
- $8B/$CF/ { mov cx,di }
- $03/$4E/<text_length/ { add cx,[bp+text_length] }
- $49/ { dec cx }
- $4F/ { dec di }
- $03/$7E/<start/ { add di,[bp+start] }
- $03/$FA/ { add di,dx }
- $8A/$74/$01/ { mov dh,[si+1] }
- $BB/>table/ { mov bx,offset table }
- $55/ { push bp }
- $8B/$E9/ { mov bp,cx }
- $8A/$EC/ { mov ch,ah }
- $FD/ { std }
- $EB/$05/ { jmp short comp }
- $D7/ { nexttable: xlat }
- $03/$F8/ { add di,ax }
- $72/$2A/ { jc notfound }
- $3B/$EF/ { comp: cmp bp,di }
- $72/$26/ { jb notfound }
- $26/$8A/$05/ { mov al,es:[di] }
- $3A/$F0/ { cmp dh,al }
- $75/$F0/ { jne nexttable }
- $4F/ { dec di }
- $8A/$CA/ { mov cl,dl }
- $F3/$A6/ { repe cmpsb }
- $74/$0D/ { je found }
- $8A/$C2/ { mov al,dl }
- $2B/$C1/ { sub ax,cx }
- $03/$F8/ { add di,ax }
- $47/ { inc di }
- $03/$F0/ { add si,ax }
- $8A/$C6/ { mov al,dh }
- $EB/$DC/ { jmp short nexttable }
- $5D/ { found: pop bp }
- $C4/$46/<text_array/ { les ax,[bp+text_array] }
- $97/ { xchg ax,di }
- $2B/$C7/ { sub ax,di }
- $40/ { inc ax }
- $40/ { inc ax }
- $EB/$03/ { jmp short exit }
- $5D/ { notfound: pop bp }
- $32/$C0/ { notfound2: xor al,al }
- $89/$46/$FE/ { exit: mov [bp-2],ax }
- $FC/ { cld }
- $1F) { pop ds }
- end; { BOYER_MOORE_SEARCH }
- {****************************************************************************}
- end.