home *** CD-ROM | disk | FTP | other *** search
- From: jerry@olivey.olivetti.com (Jerry Aguirre)
- Newsgroups: news.software.b,alt.sources
- Subject: Tool to find duplicate articles
- Message-ID: <49290@olivea.atc.olivetti.com>
- Date: 16 Aug 90 19:06:16 GMT
-
- Here is a tool I threw together when my news history got corrupted and
- users started complaining about seeing duplicates of articles.
-
- ===BEGIN histdups.c===
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define LINESIZ 1024	/* size of one input-line buffer and of one stored file name */
#define MAXF 32		/* maximum number of file names remembered per message-id */

/*
 * histdups - list duplicate news articles.
 *
 * Expects stdin to be the news history file, sorted.  Stdout is a list
 * of file names which are duplicates of earlier articles.  Run it after
 * "expire -r" and then "rm" the files listed in the output:
 *
 *	sort <history | histdups >dupfiles; xargs <dupfiles rm
 *
 * If the news history becomes corrupted you can wind up with
 * duplicates.  These are both a waste of space and a pain for people
 * reading news.
 *
 * B news "expire -r" will find the dups and then enter all of them into
 * the history file.  (It doesn't even match up the cross postings
 * to each other correctly.)  This program outputs the names of all
 * but the first duplicate in each newsgroup, where "first" is based
 * on article numbering, which presumably represents arrival order.
 *
 * 16Aug90 Jerry Aguirre <jerry@atc.olivetti.com>
 */

/* File names ("group/number") collected for the message-id being processed. */
char files[MAXF][LINESIZ];
/* Number of entries of files[] currently in use. */
int nf;

/*
 * Full prototypes (instead of empty parameter lists) so that calls are
 * type-checked; they stay compatible with the library declarations.
 */
long atol(const char *);
char *index(const char *, int);
-
- main()
- {
- char c, *p;
- int i, j;
- char line[LINESIZ];
- char id[LINESIZ];
- char lastline[LINESIZ];
-
- nf = 0;
- id[0] = '\0';
- lastline[0] = '\0';
- while (gets(line)) {
- p = index(line, '\t');
- if (p) {
- *p = '\0';
- if (strcmp(line, id) == 0) { /* we have a dup */
- if (lastline[0] != '\0') {
- parsefiles(lastline);
- lastline[0] = '\0';
- }
- *p = '\t';
- parsefiles(line);
- } else {
- printdups();
- strcpy(id, line);
- *p = '\t';
- strcpy(lastline, line);
- nf = 0;
- }
- }
- }
- }
- parsefiles(line) char *line;
- {
- char *pd, *pf, *p;
-
- pd = index(line, '\t');
- if (pd) pd++;
- else return;
- pf = index(pd, '\t');
- if (pf) pf++;
- else return;
- while (*pf) {
- while (*pf == ' ') pf++;
- if (*pf == '\0') return;
- if (nf >= MAXF) return;
- p = index(pf, ' ');
- if (p) *p = '\0';
- strcpy(files[nf], pf);
- nf++;
- if (p) {
- pf = p + 1;
- *p = ' ';
- }
- else return;
- }
- }
-
- printdups()
- {
- int i1, i2, flags[MAXF];
- long n1, n2;
- char *p1, *p2;
-
- for (i1 = 0; i1 < nf; i1++) flags[i1] = 0;
-
- for (i1 = 0; i1 < nf; i1++) {
- p1 = index(files[i1], '/');
- if (!p1) continue;
- *p1 = '\0';
- n1 = atol(p1+1);
- for (i2 = i1 + 1; i2 < nf; i2++) {
- p2 = index(files[i2], '/');
- if (!p2) continue;
- *p2 = '\0';
- if (strcmp(files[i1], files[i2]) == 0) { /* same group */
- n2 = atol(p2+1);
- if (n2 > n1) flags[i2] = 1; /* lowest number stays */
- else if (n2 < n1) flags[i1] = 1;
- }
- *p2 = '/';
- n2 = atol(p2+1);
- }
- *p1 = '/';
- }
- for (i1 = 0; i1 < nf; i1++) {
- if (flags[i1] == 1) {
- for (p1 = files[i1]; *p1; p1++) {
- if (*p1 == '.') putchar('/');
- else putchar(*p1);
- }
- putchar('\n');
- }
- }
- }
- ===END histdups.c===
-