/* Copyright (C) 2014 Robert, robert@dummy.us.eu.org */ /* This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include #include #include #include #include #ifdef GDBM #include #include #include #include #include #endif extern char threechars[][4]; int _IO_stderr_; /* don't under why this is necessary */ #define HASHTABLESIZ 524288 // 262144 // 65536 #define MAXWORDS 262144 // 65536 #define BOB 1 /* http://burtleburtle.net/bob/hash/hashfaq.html#unique Hash Function FAQ */ static int findprefix(const char* word) { int i; for (i = 1; threechars[i][0]; i++) { int v = strncmp(word, threechars[i], 3); if (!v) { return 1; } if (v < 0) return 0; } return 0; } #ifdef GDBM struct wordhasharray { int freq; unsigned long hash; char* str; char* orig; struct phrasehashent* phrase; int misspelled; }; struct wordhashtable { GDBM_FILE gdbm; int numents; struct phrasehashent* phrase; }; struct phrasehashent { struct wordhashtable* table; int freq; int initfreq; }; struct phrasehashtable { struct phrasehashent table; int numents; }; static void inithashtable(struct wordhashtable *table) { table->numents = 0; table->phrase = NULL; } static void initphrasehashtable(struct phrasehashtable *table) { table->numents = 0; table->table.table = NULL; } static void gdbm_fatal() { fprintf(stderr, "gdbm fatal\n"); exit(1); } static void cleanphrasehashtable(AspellSpeller* speller, struct phrasehashtable *table) { datum key; key = gdbm_firstkey(table->table.table->gdbm); while (key.dptr) { datum nextkey; datum content; nextkey = gdbm_nextkey(table->table.table->gdbm, key); content = gdbm_fetch(table->table.table->gdbm, key); if (content.dsize == 1 && content.dptr[0] <= '1') { gdbm_delete(table->table.table->gdbm, key); free(content.dptr); } else { char contentstr[BUFSIZ]; int freq; strncpy(contentstr, content.dptr, content.dsize); contentstr[content.dsize] = '\0'; freq = atoi(contentstr); freq--; free(content.dptr); sprintf(contentstr, "%d", freq); content.dptr = contentstr; content.dsize = strlen(contentstr); gdbm_store(table->table.table->gdbm, key, content, GDBM_REPLACE); } free(key.dptr); key = nextkey; } gdbm_reorganize(table->table.table->gdbm); } static int timetocheck(const char* fn) { struct stat st; int ret = 0; if (stat(fn, &st) == 0) { time_t now; struct tm tm; time(&now); if (localtime_r(&now, &tm)) { time_t saturday; saturday = now - tm.tm_sec - (tm.tm_min * 60) - (tm.tm_hour * 60 * 60) - (((tm.tm_wday + 1) % 7) * 60 * 60 * 24); if (st.st_mtime < saturday) { ret = 1; } } } return ret; } static void readphrases(AspellSpeller* speller, struct wordhashtable *hashtable, struct phrasehashtable* table, const char* fn, int rflag) { char newfn[BUFSIZ]; int needscleaning = 0; strcpy(newfn, fn); strcat(newfn, ".db"); if (!rflag) { needscleaning = timetocheck(newfn); } for (int times = 0; times < 10; times++) { hashtable->gdbm = gdbm_open(newfn, BUFSIZ, rflag ? GDBM_READER : GDBM_WRCREAT, 0644, gdbm_fatal); if (hashtable->gdbm) { break; } sleep(2); } if (hashtable->gdbm == NULL) { int c; fprintf(stderr, "phrases: cannot open %s: %s\n", newfn, gdbm_strerror(gdbm_errno)); while ((c = getchar()) != EOF) { putchar(c); } exit(1); } table->table.table = hashtable; hashtable->phrase = &table->table; if (needscleaning) { cleanphrasehashtable(speller, table); } } static void closephrases(struct phrasehashtable* table) { gdbm_close(table->table.table->gdbm); table->table.table->gdbm = 0; } static struct wordhasharray* addword(AspellSpeller* speller, struct wordhashtable *table, const char* word, int initialization, int rflag) { struct wordhasharray* arr; arr = (struct wordhasharray*)malloc(sizeof *arr); arr->freq = arr->hash = 0; arr->phrase = table->phrase; arr->str = arr->orig = strdup(word); arr->misspelled = !aspell_speller_check(speller, word, strlen(word)); } static struct phrasehashent* addphrase(AspellSpeller* speller, struct phrasehashtable* table, struct wordhasharray** array, int rflag) { datum key; char keystr[BUFSIZ]; datum content; strcpy(keystr, array[0]->str); strcat(keystr, "\t"); strcat(keystr, array[1]->str); key.dptr = keystr; key.dsize = strlen(keystr); content = gdbm_fetch(table->table.table->gdbm, key); // IMPROVEMENT // store phrase in memory, along with freq and increment when in // read-only mode; this will make it behave like the old version if (content.dptr) { char contentstr[BUFSIZ]; strncpy(contentstr, content.dptr, content.dsize); contentstr[content.dsize] = '\0'; table->table.freq = table->table.initfreq = atoi(contentstr); table->table.freq++; free(content.dptr); if (!rflag) { sprintf(contentstr, "%d", table->table.freq); content.dptr = contentstr; content.dsize = strlen(contentstr); gdbm_store(table->table.table->gdbm, key, content, GDBM_REPLACE); } } else { if (!rflag) { content.dptr = "1"; content.dsize = 1; gdbm_store(table->table.table->gdbm, key, content, GDBM_REPLACE); } table->table.freq = table->table.initfreq = 1; table->numents++; } return &table->table; } static void writephrases(struct phrasehashtable* table, const char* fn) { } #else // GDBM struct wordhashtable { struct wordhasharray* table[HASHTABLESIZ]; int numents; struct wordhasharray array[MAXWORDS]; int numwords; }; struct phrasehashent { unsigned long hash; int freq; int initfreq; int len; struct wordhasharray** words; struct phrasehashent** prev; struct phrasehashent** next; }; struct phrasehashtable { struct phrasehashent table[HASHTABLESIZ]; int numents; }; static unsigned long hashword(const char* word) { const char* p; unsigned long hash; hash = 0; for (p = word; *p; p++) { #ifdef BOB /* see above */ hash += tolower(*p); hash += (hash<<10); hash ^= (hash>>6); #else hash += (hash << 2) + tolower(*p); #endif } return(hash); } struct sorthasharray { float freq; struct wordhasharray* ent; }; static int sorthasharray(const void* a, const void* b) { struct sorthasharray* aa = (struct sorthasharray*)a; struct sorthasharray* bb = (struct sorthasharray*)b; #if 0 return(aa->freq > bb->freq ? 1 : (aa->freq < bb->freq ? -1 : 0)); #else return(aa->freq < bb->freq ? 1 : (aa->freq > bb->freq ? -1 : 0)); #endif } static void cleanhashtable(AspellSpeller* speller, struct wordhashtable *table); static int hashtablelookup1(struct wordhashtable* table, unsigned long hash, int usepointer, const char* word, int allowfail, int rflag) { int orig_ent; int ent; //printf("hashtablelookup: hash=%lu\n", hash); orig_ent = ent = hash % HASHTABLESIZ; while (table->table[ent]) { if (table->table[ent]->hash == hash && ((usepointer && table->table[ent]->str == word) || !strcmp(table->table[ent]->str, word))) { // printf("hashtablelookup: found existing word\n"); break; } ent = (ent + 1) % HASHTABLESIZ; if (ent == orig_ent) { if (allowfail) { ent = -1; } break; } } return(ent); } static int hashtablelookup(struct wordhashtable* table, unsigned long *hash, int usepointer, const char* word, int rflag) { *hash = hashword(word); if (!strcmp(word, "(null)")) { word = 0; } return(hashtablelookup1(table, *hash, usepointer, word, 1, rflag)); } static void cleanhashtable(AspellSpeller* speller, struct wordhashtable *table) { struct sorthasharray array[MAXWORDS]; unsigned int i; unsigned int ent; unsigned int ent1; unsigned int orig_ent1; unsigned int numfound; for (i = 0; i < (sizeof array) / (sizeof *array); i++) { #if 0 float wordfact; int freq; int times; struct phrasehashent* phrase; int len; wordfact = 1; if (!findprefix(table->array[i].str)) { wordfact = 0.5; } if (table->array[i].misspelled || !aspell_speller_check(speller, table->array[i].str, len = strlen(table->array[i].str))) { wordfact *= 0.5; table->array[i].misspelled = 1; } for (times = 0, freq = table->array[i].freq, phrase = table->array[i].phrase; phrase && times < table->array[i].freq; times++) { freq += phrase->freq; if (phrase->words != 0) { if (phrase->words[0] == table->array + i) phrase = phrase->next[0]; else if (phrase->words[1] == table->array + i) phrase = phrase->next[1]; else phrase = 0; } else phrase = 0; } array[i].freq = freq * wordfact; #else array[i].freq = table->array[i].freq; #endif array[i].ent = table->array + i; } qsort(array, (sizeof array) / (sizeof *array), sizeof *array, sorthasharray); for (numfound = i = 0; #if 0 // seems to prevent infinite loops numfound < ((sizeof array) / (sizeof *array)) / 8 && i < ((sizeof array) / (sizeof *array)) #else i < ((sizeof array) / (sizeof *array)) / 8 #endif ; i++) { //struct phrasehashent* phrase; unsigned long hash; ent = hashtablelookup1(table, array[i].ent->hash, 1, array[i].ent->str, 0, 0); if (table->table[ent]) { int last_ent1; numfound++; table->table[ent] = NULL; for (last_ent1 = orig_ent1 = ent, ent1 = (ent + 1) % HASHTABLESIZ; table->table[ent1] && table->table[ent1]->hash % HASHTABLESIZ == ent; ent1 = (ent1 + 1) % HASHTABLESIZ) { table->table[(ent1 + HASHTABLESIZ - 1) % HASHTABLESIZ] = table->table[ent1]; last_ent1 = ent1; if (ent1 == orig_ent1) break; } table->table[last_ent1] = NULL; } free(array[i].ent->str); fprintf(stderr, "phrases: deleting '%s' (freq=%f)\n", array[i].ent->orig, array[i].freq); free(array[i].ent->orig); hash = array[i].ent->hash; //phrase = array[i].ent->phrase; memset(array[i].ent, 0, sizeof *array[i].ent); //array[i].ent->phrase = phrase; array[i].ent->hash = hash; table->numents--; } } static int findfreeword(AspellSpeller* speller, struct wordhashtable* table) { int i; if (table->numwords < MAXWORDS && table->array[table->numwords].freq == 0) { return(table->numwords++); } for (;;) { for (i = 0; table->array[i].freq != 0 && i < MAXWORDS; i++); if (i < MAXWORDS) { return(i); } fprintf(stderr, "findfreeword: hash table full\n"); cleanhashtable(speller, table); } return(0); } static struct wordhasharray* addword(AspellSpeller* speller, struct wordhashtable *table, const char* word, int initialization, int rflag) { int wordent; int ent; unsigned long hash; char localcopy[BUFSIZ]; const char* local = localcopy; char* p; const char* q; if (initialization) { local = word; } else { for (p = localcopy, q = word; *q; q++, p++) *p = tolower(*q); *p = '\0'; } ent = -1; while (ent < 0) { ent = hashtablelookup(table, &hash, 0, local, rflag); if (ent >= 0) break; //fprintf(stderr, "phrases: hashtablelookup1: hash table full\n"); cleanhashtable(speller, table); } if (table->table[ent]) { // printf("addword: found %s\n", local); table->table[ent]->freq++; if (strcmp(table->table[ent]->orig, word)) { if (table->table[ent]->orig != table->table[ent]->str) { free(table->table[ent]->orig); } table->table[ent]->orig = strdup(word); } } else { wordent = findfreeword(speller, table); memset(&table->array[wordent], 0, sizeof table->array[wordent]); table->array[wordent].freq = 1; table->array[wordent].hash = hash; if (!strcmp(local, "(null)")) { table->array[wordent].str = 0; } else { table->array[wordent].str = strdup(local); } if (word == local || !strcmp(local, word)) { table->array[wordent].orig = table->array[wordent].str; } else { table->array[wordent].orig = strdup(word); } // printf("addword: wordent=%d, %s\n", wordent, local); table->table[ent] = table->array + wordent; table->numents++; } return(table->table[ent]); } static void inithashtable(struct wordhashtable *table) { memset(table, 0, sizeof *table); } static void initphrasehashtable(struct phrasehashtable *table) { memset(table, 0, sizeof *table); } static void dumphashtable(struct wordhashtable *table) { int i; for (i = 0; i < HASHTABLESIZ; i++) { if (table->table[i]) printf("%s %d\n", table->table[i]->str, table->table[i]->freq); } } static void dumpdocument(struct wordhasharray* const* array, int num) { int i; for (i = 0; i < num; i++) { printf("%s %d", array[i]->str, array[i]->freq); printf(" -- phrase %s %s %d\n", array[i]->phrase->words[0]->str, array[i]->phrase->words[1]->str, array[i]->phrase->freq); } } static unsigned long hashaddrs(const char* word, int len) { const char* p; unsigned long hash; hash = 0; for (p = word; p - word < len; p++) { #ifdef BOB /* see above */ hash += tolower(*p); hash += (hash<<10); hash ^= (hash>>6); #else hash += (hash << 2) + tolower(*p); #endif } return(hash); } struct sortphrases { float freq; struct phrasehashent* ent; }; static int sortphrasehashents(const void* a, const void* b) { struct sortphrases* aa = (struct sortphrases*)a; struct sortphrases* bb = (struct sortphrases*)b; #if 0 return(aa->ent->words && (aa->ent->words[0]->str == 0 || aa->ent->words[1]->str == 0) ? -1 : (bb->ent->words && (bb->ent->words[0]->str == 0 || bb->ent->words[1]->str == 0) ? 1 : (aa->freq > bb->freq ? 1 : (aa->freq < bb->freq ? -1 : 0)))); #else return(aa->freq > bb->freq ? 1 : (aa->freq < bb->freq ? -1 : 0)); #endif } static int findphrase1(AspellSpeller* speller, struct phrasehashtable* table, unsigned long hash, struct wordhasharray* const* array, int canclean, int rflag); static void cleanphrasehashtable(AspellSpeller* speller, struct phrasehashtable *table) { struct sortphrases phrases[MAXWORDS]; unsigned int i; unsigned int ent; unsigned int ent1; unsigned int orig_ent1; unsigned int numfound; for (i = 0; i < (sizeof phrases) / (sizeof *phrases); i++) { float word1fact; float word2fact; word1fact = word2fact = 1; if (table->table[i].words[0]->str == 0) word1fact = 0; else { int len; if (!findprefix(table->table[i].words[0]->str)) { word1fact *= 0.25; table->table[i].words[0]->misspelled = 1; } else if (table->table[i].words[0]->misspelled || !aspell_speller_check(speller, table->table[i].words[0]->str, len = strlen(table->table[i].words[0]->str))) { word1fact *= 0.5; table->table[i].words[0]->misspelled = 1; } } if (table->table[i].words[1]->str == 0) word2fact = 0; else { int len; if (!findprefix(table->table[i].words[1]->str)) { word2fact *= 0.25; table->table[i].words[1]->misspelled = 1; } else if (table->table[i].words[1]->misspelled || !aspell_speller_check(speller, table->table[i].words[1]->str, len = strlen(table->table[i].words[1]->str))) { word2fact *= 0.5; table->table[i].words[1]->misspelled = 1; } } phrases[i].freq = table->table[i].freq * word1fact + table->table[i].freq * word2fact; // (float)(table->table[i].freq * word1fact) / table->table[i].words[0]->freq + // (float)(table->table[i].freq * word2fact) / table->table[i].words[1]->freq; phrases[i].ent = table->table + i; } qsort(phrases, (sizeof phrases) / (sizeof *phrases), sizeof *phrases, sortphrasehashents); for (numfound = i = 0; #if 1 // seems to prevent infinite loops numfound < ((sizeof phrases) / (sizeof *phrases)) / 64 && i < ((sizeof phrases) / (sizeof *phrases)) #else i < ((sizeof phrases) / (sizeof *phrases)) / 64 #endif ;) { if (phrases[i].ent->words) { ent = findphrase1(speller, table, phrases[i].ent->hash, phrases[i].ent->words, 0, 0); if (table->table[ent].words) { int last_ent1; unsigned long saved_last_ent_hash; struct phrasehashent** saved_last_ent_next; struct phrasehashent** saved_last_ent_prev; unsigned long saved_hash; struct phrasehashent** saved_next; struct phrasehashent** saved_prev; numfound++; saved_last_ent_hash = saved_hash = table->table[ent].hash; saved_last_ent_next = saved_next = table->table[ent].next; saved_last_ent_prev = saved_prev = table->table[ent].prev; //fprintf(stderr, "phrases: eliminating '%s %s' (freq=%f)\n", // table->table[ent].words[0] // ? table->table[ent].words[0]->orig // : "(unknown)", // table->table[ent].words[1] // ? table->table[ent].words[1]->orig // : "(unknown)", // phrases[i].freq); if (table->table[ent].words[0]) { if (table->table[ent].words[0]->freq > 0) { table->table[ent].words[0]->freq--; } } if (table->table[ent].words[1]) { if (table->table[ent].words[1]->freq > 0) { table->table[ent].words[1]->freq--; } } memset(&table->table[ent], 0, sizeof table->table[ent]); last_ent1 = ent; for (orig_ent1 = ent1, ent1 = (ent + 1) % HASHTABLESIZ; table->table[ent1].words && table->table[ent1].hash % HASHTABLESIZ == ent; ent1 = (ent1 + 1) % HASHTABLESIZ) { table->table[(ent1 + HASHTABLESIZ - 1) % HASHTABLESIZ] = table->table[ent1]; last_ent1 = ent1; saved_last_ent_hash = table->table[ent1].hash; saved_last_ent_next = table->table[ent1].next; saved_last_ent_prev = table->table[ent1].prev; if (ent1 == orig_ent1) break; } memset(&table->table[last_ent1], 0, sizeof table->table[last_ent1]); table->table[last_ent1].hash = saved_last_ent_hash; table->table[last_ent1].next = saved_last_ent_next; table->table[last_ent1].prev = saved_last_ent_prev; table->table[ent].hash = saved_hash; table->table[ent].next = saved_next; table->table[ent].prev = saved_prev; continue; } } i++; } for (; i < (sizeof phrases) / (sizeof *phrases); i++) { // attempt to phase-out old phrases if (phrases[i].ent->freq > 1) phrases[i].ent->freq--; } } static int findphrase1(AspellSpeller* speller, struct phrasehashtable* table, unsigned long hash, struct wordhasharray* const* array, int canclean, int rflag) { int orig_ent; int ent; orig_ent = ent = hash % HASHTABLESIZ; while (table->table[ent].words) { //printf("findphrase1: %s\n", table->table[ent].words[0]->str); if (table->table[ent].hash == hash && !memcmp(table->table[ent].words, array, (sizeof *array) * 2)) { break; } ent = (ent + 1) % HASHTABLESIZ; if (orig_ent == ent) { if (!rflag) { if (canclean) { //fprintf(stderr, "phrases: phrase hash table full\n"); cleanphrasehashtable(speller, table); } else { fprintf(stderr, "phrases: findphrase1: phrase hash table full; clobbering entry\n"); break; } } else { break; } } } return(ent); } static int findphrase(AspellSpeller* speller, struct phrasehashtable* table, unsigned long* hash, struct wordhasharray* const* array, int rflag) { *hash = hashaddrs((const char*)array, (sizeof *array) * 2); return(findphrase1(speller, table, *hash, array, 1, rflag)); } static struct phrasehashent* addphrase(AspellSpeller* speller, struct phrasehashtable* table, struct wordhasharray** array, int rflag) { int ent; unsigned long hash; ent = findphrase(speller, table, &hash, array, rflag); if (rflag) { return table->table + ent; } if (table->table[ent].words) { table->table[ent].freq = table->table[ent].initfreq + 1; } else { table->table[ent].initfreq = 0; table->table[ent].freq = 1; table->table[ent].len = 2; table->table[ent].hash = hash; table->table[ent].words = (struct wordhasharray**)calloc(2, sizeof *array); memcpy(table->table[ent].words, array, (sizeof *array) * 2); table->numents++; table->table[ent].prev = (struct phrasehashent**)calloc(2, sizeof *table->table[ent].prev); table->table[ent].next = (struct phrasehashent**)calloc(2, sizeof *table->table[ent].next); table->table[ent].next[0] = array[0]->phrase; if (array[0]->phrase) array[0]->phrase->prev[0] = table->table + ent; array[0]->phrase = table->table + ent; if (array[0] != array[1]) { table->table[ent].next[1] = array[1]->phrase; if (array[1]->phrase) array[1]->phrase->prev[1] = table->table + ent; array[1]->phrase = table->table + ent; } //printf("addphrase: new phrase %s %s\n", // table->table[ent].words[0]->str, // table->table[ent].words[1]->str); } return table->table + ent; } static void readphrases(AspellSpeller* speller, struct wordhashtable *hashtable, struct phrasehashtable* table, const char* fn, int rflag) { FILE* fp; char buf[BUFSIZ]; char str1[BUFSIZ]; char str2[BUFSIZ]; struct wordhasharray* words[2]; int freq; struct phrasehashent* phrase; if ((fp = fopen(fn, "r"))) { while (fgets(buf, sizeof buf, fp)) { if (sscanf(buf, "%d\t%s\t%s\n", &freq, str1, str2) == 3) { words[0] = addword(speller, hashtable, str1, 1, rflag); words[1] = addword(speller, hashtable, str2, 1, rflag); phrase = addphrase(speller, table, words, rflag); phrase->freq = phrase->initfreq = freq; } } fclose(fp); } else { perror(fn); } } static void writephrases(struct phrasehashtable* table, const char* fn) { FILE* fp; int i; if ((fp = fopen(fn, "w"))) { for (i = 0; i < HASHTABLESIZ; i++) { if (table->table[i].freq > 0 && table->table[i].words[0]->str != 0 && table->table[i].words[1]->str != 0) { fprintf(fp, "%d\t%s\t%s\n", table->table[i].freq, table->table[i].words[0]->str, table->table[i].words[1]->str); } } fclose(fp); } else { perror(fn); } } static void closephrases(struct phrasehashtable* table) { } #endif // GDBM static int isskippedspace(const char c) { return isspace(c) || c == '>' || c == '_'; // || c == '<'; } static void replacespaces(char* str) { char* p; for (p = str; *p && isskippedspace(*p); p++); if (!*p) { while (*str) { while (isskippedspace(*str) && isskippedspace(str[1])) strcpy(str, str + 1); if (isskippedspace(*str)) *str = '_'; str++; } } } int replaceescapes(char* p) { int numspaces = 0; while (*p) { if (!strncmp(p, "=20", 3)) { p[0] = p[1] = p[2] = ' '; numspaces += 3; } p++; } return numspaces; } int onlyspaces(const char* p, int pastheader, int pure) { char lastch; lastch = ' '; while (*p) { if (pure) { if (!isspace(*p)) return 0; } else if (!isskippedspace(*p)) return 0; if (lastch == '\n') { if (*p == '\n') return 0; if (!pastheader) return 0; } lastch = *p; p++; } if (lastch == '\n' && !pastheader) return 0; return 1; } static void grabphonenumber(char* betweenwords, int startoff, int* betweencnt) { int j; int k; int l; #ifndef LEAVEDIGITS int numdigits; #endif if (startoff < 0) { return; } // find first digit for (j = startoff; !isdigit(betweenwords[j]); j++); // find last digit for (l = *betweencnt - 1; !isdigit(betweenwords[l]); l--); betweenwords[*betweencnt] = '\0'; // delete all non-digits for (numdigits = 0, k = j; k <= l; ) { if (isdigit(betweenwords[k])) { #ifndef LEAVEDIGITS if (k == j || k == l) { // fold digit for ifile betweenwords[k] = (betweenwords[k] - '0') + 'a'; } else { numdigits++; } #endif } else { char* p = betweenwords + k; while (*p) { *p = p[1]; p++; } l--; continue; } k++; } // reset *betweencnt *betweencnt = strlen(betweenwords); #ifdef LEAVEDIGITS if (j == startoff || betweenwords[j - 1] != 'x') { // insert an 'x' just before first digit for (k = *betweencnt; k > j; k--) { betweenwords[k] = betweenwords[k - 1]; } (*betweencnt)++; betweenwords[j] = 'x'; } // find last digit again for (l = *betweencnt - 1; !isdigit(betweenwords[l]); l--); // insert an 'x' just after last digit for (k = *betweencnt; k > l; k--) { betweenwords[k] = betweenwords[k - 1]; } (*betweencnt)++; betweenwords[l + 1] = 'x'; #endif betweenwords[*betweencnt] = '\0'; } int main(int argc, char* argv[]) { int c; int lastch; int charcnt; int betweencnt; int verbose; struct wordhasharray* words[2] = {0, 0}; char lastword[14]; char betweenwords[BUFSIZ * 2]; static struct wordhashtable hashtable; static struct phrasehashtable phrasehashtable; int i; int doclen; int column; int lastcolumn; int ac; char** av; int rflag; int nflag; int pastheader; int prevwasonlyspaces; int ateof; AspellSpeller *speller; AspellConfig *speller_config; int runofdigits; int contigdigits; int firstdigit; char firstnum; int initnumwords; int initnumphrases; int spacessincenewline; ac = argc; av = argv; inithashtable(&hashtable); initphrasehashtable(&phrasehashtable); nflag = rflag = 0; while (ac > 1) { if (!strcmp(av[1], "-r")) { ac--; av++; rflag++; } else if (!strcmp(av[1], "-n")) { ac--; av++; nflag++; } else break; } speller_config = new_aspell_config(); aspell_config_replace(speller_config, "lang", "en_US"); aspell_config_replace(speller_config, "ignore-case", "true"); AspellCanHaveError* possible_err = new_aspell_speller(speller_config); if (aspell_error_number(possible_err) != 0) { fputs("phrases: ", stderr); fputs(aspell_error_message(possible_err), stderr); fputc('\n', stderr); } speller = to_aspell_speller(possible_err); initnumwords = 0; initnumphrases = 0; if (ac > 1) { readphrases(speller, &hashtable, &phrasehashtable, av[1], 0); initnumwords = hashtable.numents; initnumphrases = phrasehashtable.numents; } verbose = 0; firstnum = firstdigit = runofdigits = betweencnt = charcnt = 0; contigdigits = 0; spacessincenewline = 0; lastword[0] = betweenwords[0] = '\0'; ateof = prevwasonlyspaces = pastheader = column = i = 0; lastch = ' '; if (nflag) pastheader = 1; // if it's not a message, don't differentiate between header and body while (!ateof) { c = getchar(); if (c == EOF) { ateof = 1; c = '.'; } if (isalpha(c)) { if ((runofdigits == 0 || runofdigits > 2) && firstdigit != -1 && contigdigits) { grabphonenumber(betweenwords, firstdigit, &betweencnt); } contigdigits = firstnum = runofdigits = 0; firstdigit = -1; column++; if (charcnt < 13) { lastword[charcnt] = c; } else if (charcnt >= 13) { if (charcnt == 13) { lastword[13] = '\0'; int len = strlen(lastword); if (betweencnt + len > sizeof betweenwords) { printf("%s", betweencnt); betweencnt = 0; } betweenwords[betweencnt] = '\0'; // printf("appending %s\n", lastword); strcat(betweenwords, lastword); betweencnt = strlen(betweenwords); lastword[0] = '\0'; } if (column >= 1024 && spacessincenewline == 0) { betweenwords[betweencnt] = ' '; betweencnt++; column++; spacessincenewline++; } betweenwords[betweencnt] = c; betweencnt++; } charcnt++; } else { int wasonlyspaces = isskippedspace(c); int putspaceafter = 0; if (c == '\n' && (betweencnt == 0 || lastch != '=')) { column = 0; spacessincenewline = 0; } else { column++; } betweenwords[betweencnt] = '\0'; if (betweencnt - 1 >= sizeof betweenwords) { printf("%s", betweenwords); betweencnt = 0; } if (charcnt > 2 && charcnt < 13) { spacessincenewline += replaceescapes(betweenwords); wasonlyspaces = onlyspaces(betweenwords, pastheader, 0); if (charcnt < 13) lastword[charcnt] = '\0'; else lastword[13] = '\0'; words[1] = addword(speller, &hashtable, lastword, 0, rflag); if (wasonlyspaces && i > 0) { //printf("lastcolumn = %d\n", // lastcolumn); struct phrasehashent* phrase; phrase = addphrase(speller, &phrasehashtable, words, rflag); //if ((lastcolumn != 5 || // strcmp(words[0]->orig, // "From") || // strcmp(betweenwords, " ")) // ) { // "3.24..." because it is an // estimate of alpha // http://en.wikipedia.org/wiki/Pareto_distribution#Parameter_estimation // "4" because that is in the 1% quantile // see http://www.math.uah.edu/stat/special/Pareto.html // perl -e 'print // 1.0/((1-.99)**(1.0/3.2441218159))' if (phrase->initfreq > 4 && (lastcolumn != 5 || strcmp(words[0]->orig, "From")) && c != '@' && c != '_' && c != ':' ) { replacespaces(betweenwords); } } if (i > 0) { if (words[0] && words[0]->orig) { printf("%s", words[0]->orig); } #ifdef COMPLEX if (pastheader && prevwasonlyspaces && (betweenwords[0] == '.' || betweenwords[0] == '!' || betweenwords[0] == '?') && c != '-') { putchar(' '); spacessincenewline++; } #endif } words[0] = words[1]; i++; lastcolumn = column; printf("%s", betweenwords); firstdigit = betweencnt = 0; if (column > 1024 && spacessincenewline == 0) { betweenwords[betweencnt] = ' '; betweencnt++; column++; spacessincenewline++; } else if ((!strcmp(lastword, "http") || !strcmp(lastword, "https")) && c == ':') { betweenwords[betweencnt] = ' '; betweencnt++; column++; spacessincenewline++; } } else { lastword[charcnt < 13 ? charcnt : 13] = '\0'; betweenwords[(sizeof betweenwords) - 1 - 13] = '\0'; strcat(betweenwords, lastword); betweencnt = strlen(betweenwords); } if (isdigit(c) && pastheader) { if (runofdigits == 0) { if (isspace(lastch) || lastch == '(' || lastch == '-' || lastch == '.') { runofdigits++; } if (!contigdigits) { firstnum = c; firstdigit = betweencnt; } } else { if (runofdigits > 1) { contigdigits = 1; } runofdigits++; } } else if (runofdigits > 0) { if ((runofdigits == 1 && firstnum != '1') || runofdigits < 3) { contigdigits = 0; firstdigit = -1; } runofdigits = 0; } //lastword[charcnt < 13 ? charcnt: 13] = '\0'; //printf("lastword = %s\n", lastword); //printf("pastheader = %d\n", pastheader); // if (betweencnt > 0 && c == '\n' && lastch == '=') { if (column < 1024 || spacessincenewline > 0) { betweenwords[betweencnt - 1] = '\0'; column--; } else { betweenwords[betweencnt] = c; betweencnt++; column = 0; spacessincenewline = 0; } } else if (c == ')' || c == '(' || c == '?' || c == '[' || c == ']' #ifdef COMPLEX || c == '"' #endif ) { #ifdef COMPLEX if ((c == '"') && !isalpha(lastch)) { betweenwords[betweencnt - 1] = ' '; spacessincenewline++; } #endif betweenwords[betweencnt] = ' '; betweencnt++; column++; spacessincenewline++; } else if ((c == '.' || c == '!' || c == '?') && prevwasonlyspaces && onlyspaces(betweenwords, 1, 0)) { betweenwords[betweencnt] = ' '; betweencnt++; column++; spacessincenewline++; putspaceafter++; } if (pastheader && !spacessincenewline && (isspace(c) && (lastch == ':' || lastch == ';'))) { c = lastch; } else if (!pastheader || !(c == '\n' && lastch == '=')) { if (pastheader && (c == '<' || c == ',' || (c == '>' && lastch != ' ') || (c == '/' && (lastch == '<' || lastch == ' ')))) { betweenwords[betweencnt] = ' '; spacessincenewline++; } else if (c == '"') { betweenwords[betweencnt] = '\''; } else { betweenwords[betweencnt] = c; } betweencnt++; } if (c == '(' || c == ')' || c == '[' || c == ']' #ifdef COMPLEX || c == '"' #endif ) { betweenwords[betweencnt] = ' '; betweencnt++; column++; spacessincenewline++; #ifdef COMPLEX } else if (c == '\'' && !isalpha(lastch)) { betweenwords[betweencnt] = ' '; betweencnt++; column++; spacessincenewline++; #endif } else if (putspaceafter) { betweenwords[betweencnt] = ' '; betweencnt++; column++; spacessincenewline++; } lastword[0] = '\0'; charcnt = 0; if (lastch == '\n' && c == '\n') pastheader = 1; if (c != '\n' && isskippedspace(c)) { spacessincenewline++; } prevwasonlyspaces = wasonlyspaces; } lastch = c; } printf("%s", lastword); if (words[0] && words[0]->orig) { printf("%s", words[0]->orig); } if ((runofdigits == 0 || runofdigits > 2) && firstdigit != -1 && contigdigits) { grabphonenumber(betweenwords, firstdigit, &betweencnt); } betweenwords[betweencnt] = '\0'; printf("%s", betweenwords); doclen = i; //dumphashtable(&hashtable); // dumpdocument(document, i); putchar('\n'); fflush(stderr); fflush(stdout); if (ac > 1 && !rflag && (hashtable.numents != initnumwords || phrasehashtable.numents != initnumphrases)) { writephrases(&phrasehashtable, av[1]); } closephrases(&phrasehashtable); return 0; }