Power-Programmierung

home *** CD-ROM | disk | FTP | other *** search

/ Power-Programmierung / CD2.mdf / c / library / os2 / thread / pmmthrd.c < prev next >

Wrap

C/C++ Source or Header | 1993-04-11 | 16.8 KB | 559 lines

/* PMNEWS 1.0 Threading functions for the PMNEWS news reader This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 1, as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See the file COPYING, which contains a copy of the GNU General Public License. */ #include <ctype.h> #include <string.h> #include <stdio.h> #include <stdlib.h> #include <os2.h> /* for typedefs used below */ #include "defs.h" #include "pmnews.h" /* Since this interface requires a one-pass algorithm * we need to tokenize each subject and analyze it's * meaingful tokens with respect to all previous tokens. * * Init time: * Allocate space for subject_token_space. * Initialize anchor for subject line (ARTICLE) list * * For each token: * Search against 'trivial_token_space'. * If found, iterate. * Count the token. * Hash against 'subject_token_space'. * If not found, * allocate and establish word_node. * enter this token in the new reference list. * iterate. * enter this token in the new reference list. * For each ARTICLE element on this word_node: * Search for match in temp_hit_list (a tree). * If found increment count. * else insert element in tree and in one-way list. * iterate. * end (while tokens) * * Search the temp_hit_list for the entry with the * highest count via the one-way list. * * If that count > 75% of the total token count * free temp_hit_list and new_reference_list * return TRUE with the associated ARTICLE ptr * else * allocate an ARTICLE. * save the token count in the 'left' pointer * queue the new_reference entries to their * respective word_nodes (using above ARTICLE). * free temp_hit_list and new_reference_list * return TRUE */ #define MAXTOKL 32 typedef struct hitent { ARTICLE *subjp; /* pointer to base article */ UINT nhits; /* number of times referenced */ struct hitent *hpleft, /* left branch of hit tree */ *hpright, /* right branch of hit tree */ *hpnext; /* one-way list */ } HITENT; typedef struct artref { struct artref * nextar; /* next in this chain */ struct artref * nextal; /* next in allocation chain */ ARTICLE * thread; /* pointer to thread containing this token */ } ARTREF; typedef struct tokel { char toktxt[MAXTOKL+1]; /* uppercased token */ BOOL trivial; /* token to be ignored */ struct tokel *overflow; /* collision ptr */ ARTREF *subhead; /* head of subject lines with this token */ } TOKEL; typedef struct newref { TOKEL *reftok; /* token referenced */ struct newref *nrnext; /* next new reference */ } NEWREF; typedef struct { TOKEL * next_ovflo; /* next available overflow */ TOKEL * subject_token_space; /* primary token area */ TOKEL * limit_addr; /* limit of acquired area */ ARTREF * headref; /* allocation chain head for ARTREF */ ARTICLE * mainseq; /* main chain of articles */ FILE * tlog; /* debug file */ } THREAD_ANCHOR; #define HASH_WIDTH 1023 static const char *trivia[] = { "THE", "IN", "A", "OF", "WITH", "TO", "BE", "IS", "FOR", "ON", "RE:", "AN", "WAS:", "AND", "HOW" }; static const UINT numtriv = (sizeof(trivia)/sizeof(trivia[0])); /* called at completion of thread pass for the newsgroup */ void thread_end( THREAD_ANCHOR * anch ) { ARTREF * warp, * warpn; #ifdef DEBUG fclose( anch->tlog ); #endif warp = anch->headref; while ( warp != NULL ) { warpn = warp; warp = warp->nextal; free( warpn ); } free( anch->subject_token_space ); free( anch ); return; } /* valid token chars, binary 0 is invalid, \b is ignored else it's the upper-case of char */ static const char sigchar[] = { '\0', '\0', '\0', '\0', '\0' , '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0' , '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0' , '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0' , '\0', '\0', '\0', '\0', '\b', '\b', '\b', '$' , '\b', '&', '\b', '\b', '\b', '\b', '\b', '\b' , '\b', '\0', '\b', /* period is whitespace */ '0', '1', '2', '3', '4' , '5', '6', '7', '8', '9', '\0', '\b', '\b' , '\b', '\b', '\b', '\b', 'A', 'B', 'C', 'D' , 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L' , 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T' , 'U', 'V', 'W', 'X', 'Y', 'Z', '\b', '\b' , '\b', '\b', '\b', '\b', 'A', 'B', 'C', 'D' , 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L' , 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T' , 'U', 'V', 'W', 'X', 'Y', 'Z', '\b', '\b' , '\b', '\b', '\0', '\0', '\0', '\0', '\0', '\0' , '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0' , '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0' , '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0' , '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0' , '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0' , '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0' , '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0' , '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0' , '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0' , '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0' , '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0' , '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0' , '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0' , '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0' , '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0' , '\0', '\0', '\0', }; /* obtain the next subject token this involves upper-casing and removing any punctuation pos is where the current position in the output 'line' is stored. inp = current input position return hash value of token or -1 if end-of-line */ static int nextokn( char ** inp, char **pos ) { char *cin, *cout, *hn; char ch; UINT hv = 0, hw = 0, cc = 0; cin = *inp; cout = *pos; hn = (char *)&hw; /* locate 1st significant char */ while ( ( *cin != '\0' ) && ( sigchar[*cin] <= '\b' ) ) cin++; if ( *cin == '\0' ) return -1; /* process chars until white-space, uppercasing */ while ( ( *cin != '\0' ) && ( ( ch = sigchar[*cin] ) != '\0' ) ) { if ( ch > '\b' ) { cc++; *hn = ch; hn++; if ( cc % sizeof(hv) == 0 ) { /* xor words together */ hv ^= hw; hn = (char *)&hw; hw = 0; } *cout = ch; cout++; } if ( cc == MAXTOKL ) break; cin++; } *cout = '\0'; *inp = cin; hv ^= hw; return hv % HASH_WIDTH; } /* free the hitlist and the new reference list */ static void free_lists( HITENT * hp, NEWREF * nr ) { HITENT *wh, *prvh; NEWREF *wr, *prvr; wh = hp; while ( wh != NULL ) { prvh = wh; wh = wh->hpnext; free( prvh ); } wr = nr; while ( wr != NULL ) { prvr = wr; wr = wr->nrnext; free( prvr ); } } /* threading algorithm returns a pointer which it wants passed to it with each new subject line from the index */ void * thread_init( void ) { THREAD_ANCHOR * tmp_anch; int hv; UINT ix; char outt[80]; char *op, *ip; TOKEL *tsp, *tsw; BOOL found; tmp_anch = xmalloc( sizeof(THREAD_ANCHOR) ); /* allocate subject token space */ tmp_anch->subject_token_space = xmalloc( sizeof(TOKEL) * (HASH_WIDTH+1)*2 ); tmp_anch->limit_addr = &(tmp_anch->subject_token_space[(HASH_WIDTH+1)*2]); tmp_anch->next_ovflo = &(tmp_anch->subject_token_space[HASH_WIDTH]); memset( tmp_anch->subject_token_space, 0, sizeof(TOKEL) * (HASH_WIDTH+1)*2 ); tmp_anch->mainseq = NULL; tmp_anch->headref = NULL; /* add the trivial tokens to the token space, marking them */ tsp = tmp_anch->subject_token_space; op = &outt[0]; for ( ix = 0; ix < numtriv; ix++ ) { ip = (char *) trivia[ix]; hv = nextokn( &ip, &op ); tsw = tsp + hv; found = FALSE; while ( tsw->toktxt[0] != '\0' ) { if ( strcmp( tsw->toktxt, outt ) == 0 ) { found = TRUE; break; } if ( tsw->overflow == NULL ) break; tsw = tsw->overflow; } if ( !found ) /* ignore duplicates */ { /* if this is a collision, allocate an overflow node */ if ( (tsp + hv)->toktxt[0] != '\0' ) { tsw = tmp_anch->next_ovflo; tmp_anch->next_ovflo++; /* push the new overflow onto the primary */ tsw->overflow = (tsp + hv)->overflow; (tsp + hv)->overflow = tsw; } /* build a word node */ strcpy( tsw->toktxt, outt ); /* overflow and subhead should be NULL already */ tsw->trivial = TRUE; } } /* for each trivial string */ #ifdef DEBUG tmp_anch->tlog = fopen("thread.log", "w" ); #endif return tmp_anch; } /* called for each new subject line. returns TRUE if it found a subject (ARTICLE) to thread-to, FALSE if it had to allocate an ARTICLE. 'targ' is stored in all cases */ BOOL thread_compare( char * subjline, THREAD_ANCHOR * anch, ARTICLE **targ ) { char *outp, *inp; UINT tcnt = 0; /* token counter */ UINT hcnt = 0; /* hit counter */ int hashv, hvalu; TOKEL *tsp, *tsw; BOOL found; ARTICLE *newa, *wap; ARTREF *wart; HITENT *hitanc, /* tree anchor of hits for current subject */ *hitlist, /* one-way list anchor */ *hitw, /* work ptr for hit searches */ **hitp; /* ptr to previous ptr in search */ NEWREF *anew, /* anchor of new references for this entry */ *refw; /* work ptr for new refs */ char worka[257]; /* output work area for tokenizer */ anew = NULL; hitanc = NULL; hitlist = NULL; inp = subjline; /* start of input string */ outp = &(worka[0]); /* output pointer start */ memset ( outp, 0, sizeof(worka) ); tsp = anch->subject_token_space; /* obtain the next token */ while ( ( hashv = nextokn( &inp, &outp ) ) != -1 ) { /* disregard trivial tokens */ /* look up the token in the subject token space */ tsw = tsp + hashv; found = FALSE; while ( tsw->toktxt[0] != '\0' ) { if ( strcmp( tsw->toktxt, outp ) == 0 ) { found = TRUE; break; } if ( tsw->overflow == NULL ) break; tsw = tsw->overflow; } if ( found ) { if ( tsw->trivial ) continue; } else { /* not found */ /* if this is a collision, allocate an overflow node */ if ( (tsp + hashv)->toktxt[0] != '\0' ) { tsw = anch->next_ovflo; if ( tsw >= anch->limit_addr ) { /* allocate ARTICLE struct and exit */ newa = xmalloc( sizeof(ARTICLE) ); *targ = newa; return FALSE; } /* cop out for too many tokens */ anch->next_ovflo++; /* push the new overflow onto the primary */ tsw->overflow = (tsp + hashv)->overflow; (tsp + hashv)->overflow = tsw; } /* build a word node */ strcpy( tsw->toktxt, outp ); /* overflow and subhead should be NULL already */ tsw->trivial = FALSE; /* push this on the new reference list */ refw = xmalloc( sizeof(NEWREF) ); refw->reftok = tsw; refw->nrnext = anew; anew = refw; tcnt++; /* count this token */ continue; } /* push this on the new reference list - this is done whether the token is known or not in case this is a NEW thread. If the token is already on list, skip it */ refw = anew; while ( refw != NULL ) { if ( refw->reftok == tsw ) break; refw = refw->nrnext; } if ( ( refw != NULL ) && ( refw->reftok == tsw ) ) continue; refw = xmalloc( sizeof(NEWREF) ); refw->reftok = tsw; refw->nrnext = anew; anew = refw; /* count this token */ tcnt++; /* we found the token, scan thru the attached ARTICLEs and merge pointers to them into the hit tree */ wart = tsw->subhead; while ( wart != NULL ) { hitw = hitanc; hitp = NULL; while ( ( hitw != NULL ) && ( hitw->subjp != wart->thread ) ) { if ( wart->thread > hitw->subjp ) { hitp = &hitw->hpleft; hitw = hitw->hpleft; } else { hitp = &hitw->hpright; hitw = hitw->hpright; } } if ( hitw == NULL ) { /* add new tree entry */ hitw = xmalloc( sizeof(HITENT) ); /* link into one-way list */ hitw->hpnext = hitlist; hitlist = hitw; /* if no previous node, set anchor, else store this element's ptr in correct branch of previous */ if ( hitp == NULL ) hitanc = hitw; else *hitp = hitw; hitw->subjp = wart->thread; hitw->hpleft = hitw->hpright = NULL; hitw->nhits = 0; } hitw->nhits++; /* increment hits */ wart = wart->nextar; } } /* while tokens */ /* search the hit tree as a list for the highest count within a similarly tokenized article */ hitw = hitlist; wap = NULL; hvalu = 0; while ( hitw != NULL ) { int hweight; hweight = hitw->nhits - (abs((int)hitw->subjp->left - tcnt)); if ( hweight > hvalu ) { hvalu = hweight; hcnt = hitw->nhits; wap = hitw->subjp; /* save associated ARTICLE */ } hitw = hitw->hpnext; } found = FALSE; if ( wap != NULL ) { int mcnt; /* retrieve the saved token count of the previous subject line with highest hit count (hcnt) */ mcnt = (int)wap->left; /* if the current token count is 3 or less: all tokens must match (order irrelevant). else the saved token count must be within 1 of the number of hits in the current subject. */ if ( tcnt <= 3 ) { if ( ( hcnt == mcnt ) && ( tcnt == mcnt ) ) found = TRUE; } else if ( abs(mcnt - hcnt) < 2 ) found = TRUE; else if ( ( hcnt > 5 ) && ( abs(mcnt - hcnt) < 3 ) ) found = TRUE; } /* see if we have enough tokens to be a match */ if ( found ) { /* subject to be threaded to a previous ARTICLE */ #ifdef DEBUG fprintf( anch->tlog, "match: %s\n %s\n hcnt: %d tcnt %d\n", subjline, wap->header, hcnt, tcnt ); #endif *targ = wap; /* previous article */ free_lists( hitlist, anew ); return TRUE; } else { /* this is a 'new' thread */ #ifdef DEBUG fprintf(anch->tlog, "new: %s hcnt: %d tcnt %d\n", subjline, hcnt, tcnt ); #endif newa = xmalloc( sizeof(ARTICLE) ); /* main program will fill in ARTICLE, but not modify 'left' or 'right' */ newa->left = (ARTICLE *)tcnt; /* save token count */ *targ = newa; if ( anch->mainseq == NULL ) anch->mainseq = newa; /* queue the new ARTICLE to its word nodes */ refw = anew; while ( refw != NULL ) { wart = xmalloc( sizeof(ARTREF) ); wart->thread = newa; wart->nextal = anch->headref; anch->headref = wart; wart->nextar = refw->reftok->subhead; refw->reftok->subhead = wart; refw = refw->nrnext; } free_lists( hitlist, anew ); return FALSE; } }