home *** CD-ROM | disk | FTP | other *** search
- /* compute frequency of words in file. read in any ascii file, and spits
- the result (as table) on stdout, or process on existing freqlist
-
- FILE/A,SERVER/S,FREQFILE/S,ENGLISHOPTI/S,HEAVY/S
-
- FILE: words to process
- SERVER: go into server mode
- FREQFILE: expect input to be in frequency list format or just any ascii text
- ENGLISHOPTI: perform merges on english words (note: needs multiple passes)
- HEAVY: do heavy english opti (does more damage to semantics :-)
- FREQFACTOR: minimum factor of frequency for output words in server mode [default: 100]
-
- */
-
- OPT REG=5,OSVERSION=37
-
- MODULE 'tools/file', 'tools/exceptions', 'class/hash', 'tools/ctype', 'tools/arexx',
- 'tools/constructors', 'exec/nodes', 'exec/lists'
-
- OBJECT hlink OF hashlink
- count, sig
- ENDOBJECT
-
- CONST NUMTOP=1000
-
- DEF ght:PTR TO hashtable,pht:PTR TO hashtable, -> silly
- gsize,psize, -> number of words
- isheavy=FALSE,iseng=FALSE,
- minsig=100,top:PTR TO lh,largest
-
- PROC main() HANDLE
- DEF m,l,ht=NIL:PTR TO hashtable,myargs:PTR TO LONG,rdargs=NIL
- myargs:=[0,0,0,0,0,0]
- IF (rdargs:=ReadArgs('FILE/A,SERVER/S,FREQFILE/S,ENGLISHOPTI/S,HEAVY/S,FREQFACTOR/N',myargs,NIL))=NIL THEN Raise("ARGS")
- m,l:=readfile(myargs[0])
- ght:=NEW ht.hashtable(HASH_HEAVIER)
- gsize:=IF iseng:=myargs[2] THEN process_fl(m,l,ht) ELSE process(m,l,ht)
- IF gsize<1 THEN gsize:=1
- isheavy:=myargs[4]
- IF myargs[5] THEN minsig:=Long(myargs[5])
- IF myargs[3] THEN ht.iterate({engfilter})
- IF myargs[1] THEN server() ELSE ht.iterate({print})
- EXCEPT DO
- IF rdargs THEN FreeArgs(rdargs)
- report_exception()
- ENDPROC
-
- PROC process(mem,len,ht:PTR TO hashtable,listd=NIL)
- DEF p,c,a,b,h,end,hl:PTR TO hlink,numw=0,list
- end:=mem+len
- p:=mem
- LOOP
- SELECT 128 OF c:=p[]++
- CASE "\n"
- IF p>end THEN RETURN numw
- CASE "A" TO "Z", "a" TO "z"
- IF c<="Z" THEN p[-1]:=c+32
- a:=p-1
- WHILE isalpha(c:=p[])
- IF c<="Z" THEN p[]:=c+32
- p++
- ENDWHILE
- hl,h:=ht.find(a,b:=p-a)
- IF hl=NIL THEN ht.add(NEW hl,h,a,b)
- hl.count:=hl.count+1
- numw++
- p[]++:=0
- IF listd
- ^listd:=list:=NEW [NIL,hl]:LONG
- listd:=list
- ENDIF
- ENDSELECT
- ENDLOOP
- ENDPROC
-
- PROC process_fl(m,l,ht:PTR TO hashtable)
- DEF b,h,end,hl:PTR TO hlink,v,s,numw=0
- end:=m+l
- s:=m
- WHILE s<end
- v,b:=Val(s)
- s:=s+b+1
- b:=s
- WHILE b[]<>"\n" DO b++
- b:=b-s
- hl,h:=ht.find(s,b)
- IF hl=NIL THEN ht.add(NEW hl,h,s,b)
- hl.count:=hl.count+v
- numw:=numw+v
- s:=s+b
- s[]++:=0
- ENDWHILE
- ENDPROC numw
-
- /*
- ["."=checked, "*"=sem_danger, "#"=not_impl]
-
- safe extension optimisations:
-
- .. Xed X | Xe conversed
- . Xied Xy crucified
- . XYYed XY crammed, abhorred
- .. Xing X | Xe conspiring
- . Xan Xa american, an
- . Xian Xy | Xia hungarian, australian
- . Xier Xy copier
- . Xs X conveys, as?, this?
- . Xous X courageous
- . Xies Xy contemporaries
- . Xness X remoteness
- Xy Xe argueably
- .# Xly X | Xe convincingly
- . Xility Xle intangibility
- . Xacy Xate indelicacy
-
- less safe extension optimisations:
-
- activities -> activity -> active -> act
-
- * Xic X alcoholic
- * Xive X | Xe constructive
- ** Xable X | Xe argueable
- ### Xial X | Xe | Xia residential
- # Xtial Xce consequential
- * Xism X alcoholism
- * Xion X | Xe damnation, deallocation
- * Xor X | Xe coordinator
- # Xious Xy ceremonious
- ## Xant X | Xe colorant
- ## Xment X | Xe containment
- # Xlet X booklet
- .*# Xily X | Xe | Xy particularily, family?
- * Xity X actuality
-
- not used for now:
-
- Xves Xfe leaves
- Xer X | Xe manager?
- Xward X upward, awkward?, reward?
- Xar X singular?
- Xss
- Xibly
- Xend
-
- safe prefix optimisations:
-
- unX X unacceptable -> same as "not X"
- imX X imperfect, image?
- inX X incoherent
-
- less safe prefix optimisations:
-
- deX X decompression
- reX X rebuilt
- misX X misguided
-
- not used for now:
-
- overX X overflow?
- preX X prefixed?
- disX X dissatisfied, discover?
- upX X uproar?
- superX X superimpose?
- nonX X nondeterministically?
-
- */
-
- PROC engfilter(tl:PTR TO hlink,d)
- DEF l,s,hl=NIL:PTR TO hlink,v,w,x,y,z,t[100]:STRING,min=3 ->4?
- l:=tl.len
- s:=tl.data
- z:=s[l-1]
- IF l>1
- y:=s[l-2]
- IF l>2
- x:=s[l-3]
- IF l>3
- w:=s[l-4]
- IF l>4 THEN v:=s[l-4]
- ENDIF
- ENDIF
- ENDIF
- SELECT 128 OF z
- CASE "c"
- IF y="i" THEN hl:=fh(s,l-2) -> ic
- CASE "d"
- IF y="e"
- IF x="i" -> ied
- hl:=suf(t,s,l-3,'y')
- ELSEIF x=w -> XXed
- hl:=f(s,l-3)
- ELSE -> ed
- IF (hl:=f(s,l-2))=NIL THEN hl:=f(s,l-1)
- ENDIF
- ENDIF
- CASE "e"
- IF (x="i") AND (y="v") -> ive
- IF (hl:=fh(s,l-3))=NIL THEN hl:=sufh(t,s,l-3,'e')
- ELSEIF (w="a") AND (x="b") AND (y="l")
- IF (hl:=fh(s,l-4))=NIL THEN hl:=sufh(t,s,l-4,'e')
- ENDIF
- CASE "g"
- IF (x="i") AND (y="n") -> ing
- IF (hl:=f(s,l-3))=NIL THEN hl:=suf(t,s,l-3,'e')
- ENDIF
- CASE "m"
- IF (x="i") AND (y="s") THEN hl:=fh(s,l-3) -> ism
- CASE "n"
- IF y="a"
- IF x="i" -> ian
- IF (hl:=suf(t,s,l-3,'y'))=NIL THEN hl:=f(s,l-1)
- ELSE -> an
- hl:=f(s,l-1)
- ENDIF
- ELSEIF (y="o") AND (x="i") -> ion
- IF (hl:=fh(s,l-3))=NIL THEN hl:=sufh(t,s,l-3,'e')
- ENDIF
- CASE "r"
- IF y="o" -> or
- IF (hl:=fh(s,l-2))=NIL THEN hl:=sufh(t,s,l-2,'e')
- ENDIF
- CASE "s"
- IF (x="o") AND (y="u") -> ous
- hl:=f(s,l-3)
- ELSEIF (x="i") AND (y="e") -> ies
- hl:=suf(t,s,l-3,'y')
- ELSEIF (w="n") AND (x="e") AND (y="s") -> ness
- hl:=f(s,l-4)
- ELSE -> s
- hl:=f(s,l-1)
- ENDIF
- CASE "y"
- IF y="l"
- IF x="i" -> ily
- IF (hl:=fh(s,l-3))=NIL THEN hl:=sufh(t,s,l-3,'e')
- ELSE -> ly
- hl:=f(s,l-2)
- ENDIF
- ELSEIF (y="t") AND (x="i")
- IF (v="i") AND (w="l") -> ility
- hl:=suf(t,s,l-5,'le')
- ELSE -> ity
- hl:=fh(s,l-3)
- ENDIF
- ELSEIF y="c" -> acy
- hl:=suf(t,s,l-2,'te')
- ELSE -> y
- hl:=suf(t,s,l-1,'e')
- ENDIF
- ENDSELECT
- IF hl=NIL
- min:=4
- IF IF ((x:=s[])="u") THEN s[1]="n" ELSE IF x="i" THEN -> un/in/im
- ((y:=s[1])="n") OR (y="m") ELSE FALSE
- hl:=f(s+2,l-2)
- ENDIF
- ENDIF
- IF hl
- IF (hl.len>=min) AND ((hl.count>1) OR (hl.count=0))
- hl.count:=hl.count+tl.count
- tl.count:=0
- ENDIF
- ENDIF
- ENDPROC
-
- PROC suf(dest,src,len,suf)
- StrCopy(dest,src,len)
- StrAdd(dest,suf)
- ENDPROC ght.find(dest,EstrLen(dest))
-
- PROC f(s,l) IS ght.find(s,l)
- PROC fh(s,l) IS IF isheavy THEN ght.find(s,l) ELSE NIL
- PROC sufh(d,s,l,su) IS IF isheavy THEN suf(d,s,l,su) ELSE NIL
-
- PROC print(l:PTR TO hlink,d)
- PrintF('\d[8]\t\s\n',l.count,l.data)
- ENDPROC
-
- PROC server()
- WriteF('Starting Arexx Server, port: "FREQPORT", commands: "QUIT", "FREQ"\n')
- rx_HandleAll({process_msg},'FREQPORT')
- ENDPROC
-
- PROC process_msg(s)
- DEF cl,a=NIL,q=FALSE
- IF (cl:=InStr(s,' '))>0 THEN a:=s+cl+1
- IF StrCmp(s,'QUIT',cl)
- WriteF('Terminating server.\n')
- q:=TRUE
- ELSEIF StrCmp(s,'FREQ',cl)
- WriteF('Processing file "\s".\n',a)
- do(a)
- ELSE
- WriteF('Unknown Command: "\s"\n',s)
- ENDIF
- ENDPROC q,0,NIL
-
- PROC do(filename) HANDLE
- DEF m=NIL,l,ht=NIL:PTR TO hashtable,list=NIL
- top:=pht:=NIL
- m,l:=readfile(filename)
- pht:=NEW ht.hashtable(HASH_HEAVY)
- psize:=process(m,l,ht,{list})
- IF psize<1 THEN psize:=1
- IF iseng THEN ht.iterate({engfilter})
- WriteF('word ratio = \d:\d\n',gsize,psize)
- largest:=0
- ht.iterate({significant})
- top:=newlist()
- largest:=largest/127+1
- ht.iterate({sort})
- writenewfile(list,filename)
- writetop(filename)
- EXCEPT DO
- END top
- IF pht THEN pht.end_links(SIZEOF hlink)
- END pht
- IF m THEN freefile(m)
- report_exception()
- ENDPROC
-
- PROC significant(phl:PTR TO hlink,d)
- DEF numg=1,nump,hl:PTR TO hlink,sig
- nump:=phl.count
- IF hl:=ght.find(phl.data,phl.len) THEN numg:=hl.count
- IF numg<1 THEN numg:=1
- IF nump<1 THEN nump:=1
- phl.sig:=sig:=Div(Div(gsize,numg),Div(psize,nump))
- IF sig>largest THEN largest:=sig
- ENDPROC
-
- PROC sort(phl:PTR TO hlink,d)
- IF phl.sig>minsig THEN Enqueue(top,newnode(NIL,phl,0,phl.sig/largest))
- ENDPROC
-
- PROC writenewfile(list:PTR TO LONG,fn)
- DEF hl:PTR TO hlink,o:PTR TO LONG,fh,nfn[200]:STRING,numc=0
- StrCopy(nfn,fn)
- StrAdd(nfn,'.sig')
- IF fh:=Open(nfn,NEWFILE)
- WHILE o:=list
- hl:=list[1]
- list:=list[]
- END o[2]
- IF hl.sig>minsig
- IF numc+hl.len+1>78 THEN (numc:=0) BUT FputC(fh,"\n")
- Fputs(fh,hl.data)
- FputC(fh," ")
- numc:=numc+hl.len+1
- ENDIF
- ENDWHILE
- FputC(fh,"\n")
- Close(fh)
- ELSE
- WriteF('Problem opening "\s"\n',nfn)
- ENDIF
- ENDPROC
-
- PROC writetop(fn)
- DEF n:PTR TO ln,o,fh,nfn[200]:STRING,hl:PTR TO hlink,num,totsig=0,f
- StrCopy(nfn,fn)
- StrAdd(nfn,'.top')
- IF fh:=Open(nfn,NEWFILE)
- n:=top.head; num:=0
- WHILE o:=n.succ
- hl:=n.name; num++; EXIT num=NUMTOP; totsig:=totsig+hl.sig; n:=o
- ENDWHILE
- f:=totsig/num/100+1
- WriteF('tot=\d,num=\d,f=\d\n',totsig,num,f)
- n:=top.head; num:=0
- WHILE o:=n.succ
- hl:=n.name; num++; EXIT num=NUMTOP; VfPrintf(fh,'\s:\d\n',[hl.data,hl.sig/f]:LONG); END n; n:=o
- ENDWHILE
- FputC(fh,"\n")
- Close(fh)
- ELSE
- WriteF('Problem opening "\s"\n',nfn)
- ENDIF
- ENDPROC
-