/* Copyright (C) 2014  Robert, robert@dummy.us.eu.org */

/*
  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License
  as published by the Free Software Foundation; either version 2
  of the License, or (at your option) any later version.
  
  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  
  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <aspell.h>
#ifdef GDBM
#include <gdbm.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <time.h>
#endif

extern char threechars[][4];

int _IO_stderr_;	/* don't understand why this is necessary */

/* Number of slots in the open-addressed word and phrase hash tables of the
   non-GDBM build; the trailing values are earlier settings. */
#define HASHTABLESIZ	524288 // 262144 // 65536
/* Capacity of the word array in the non-GDBM build; also sizes the scratch
   arrays used by the clean* routines.  Trailing value is an earlier setting. */
#define	MAXWORDS	262144 // 65536

/* When defined, hashword() and hashaddrs() use the add/shift/xor mixing
   step; otherwise they fall back to hash = hash * 5 + c. */
#define	BOB	1  /* http://burtleburtle.net/bob/hash/hashfaq.html#unique
		      Hash Function FAQ */

/*
 * Report whether the first three characters of `word` match an entry of
 * the external threechars table.
 *
 * The scan starts at index 1 and ends at the first entry whose first byte
 * is NUL.  It also stops as soon as `word` compares below an entry
 * (presumably the table is sorted ascending -- confirm against its
 * definition).
 *
 * Returns 1 on a match, 0 otherwise.
 */
static int findprefix(const char* word)
{
	int	idx = 1;

	while (threechars[idx][0] != '\0') {
		const int	order = strncmp(word, threechars[idx], 3);

		if (order == 0)
			return 1;
		if (order < 0)
			break;
		idx++;
	}
	return 0;
}

#ifdef GDBM
/* One word record.  In this (GDBM) build addword() mallocs a fresh record
   for every word it is given. */
struct wordhasharray {
	int	freq;		/* occurrence count; addword() in this build sets it to 0 */
	unsigned long	hash;	/* hash value; addword() in this build sets it to 0 */
	char*	str;		/* word text (strdup'ed copy) */
	char*	orig;		/* original spelling; same pointer as str in this build */
	struct phrasehashent* phrase;	/* copied from the owning table's phrase pointer */
	int	misspelled;	/* non-zero when aspell_speller_check() returned 0 */
};

/* Word "table" for the GDBM build: no in-memory table, only the database
   handle plus a link to the phrase entry. */
struct wordhashtable {
	GDBM_FILE	gdbm;		/* opened by readphrases(), closed by closephrases() */
	int		numents;	/* zeroed by inithashtable(); not changed by the GDBM code visible here */
	struct phrasehashent* phrase;	/* set by readphrases() to the phrase table's single entry */
};

/* Result slot for the GDBM build: addphrase() rewrites it on every call
   and returns its address. */
struct phrasehashent {
	struct wordhashtable*	table;	/* table that holds the GDBM handle */
	int			freq;	/* count after the latest addphrase() call */
	int			initfreq; /* count read from the database before incrementing
					     (1 for a phrase that was not stored yet) */
};

/* Phrase table for the GDBM build: a single reusable entry and a counter. */
struct phrasehashtable {
	struct phrasehashent	table;	/* the one entry shared by all lookups */
	int		numents;	/* incremented by addphrase() for each phrase not yet in the database */
};

/*
 * Reset the bookkeeping fields of the GDBM-build word table.  The gdbm
 * handle itself is not touched here; readphrases() assigns it.
 */
static void inithashtable(struct wordhashtable *table)
{
	table->phrase = NULL;
	table->numents = 0;
}

/*
 * Reset the GDBM-build phrase table: no word table attached yet and a
 * zero entry count.  The freq/initfreq fields of the entry are left alone.
 */
static void initphrasehashtable(struct phrasehashtable *table)
{
	table->table.table = NULL;
	table->numents = 0;
}

/*
 * Fatal-error callback handed to gdbm_open().  GDBM invokes it with a
 * message string, so the prototype spells that parameter out (an empty
 * parameter list is not a matching prototype) and the message is reported
 * instead of being dropped.  Never returns: exits with status 1.
 */
static void gdbm_fatal(const char *msg)
{
	fprintf(stderr, "gdbm fatal: %s\n", msg ? msg : "(no message)");
	exit(1);
}

/*
 * Age every phrase stored in the GDBM database by one.
 *
 * A record whose value is a single character <= '1' is deleted; any other
 * record has its decimal count decremented and stored back.  The database
 * is reorganized afterwards.
 *
 * speller is unused in this build; it is kept for signature parity with
 * the in-memory implementation.
 *
 * NOTE(review): records are deleted/replaced while the firstkey/nextkey
 * traversal is in progress.  The next key is fetched before the current
 * record is modified, but the GDBM manual should be checked for whether
 * this can still skip or revisit keys.
 */
static void cleanphrasehashtable(AspellSpeller* speller,
				 struct phrasehashtable *table)
{
	GDBM_FILE	db = table->table.table->gdbm;
	datum		key;

	(void)speller;
	key = gdbm_firstkey(db);
	while (key.dptr) {
		datum nextkey = gdbm_nextkey(db, key);
		datum content = gdbm_fetch(db, key);

		if (content.dptr == NULL) {
			/* fetch failed: nothing to age for this key */
		} else if (content.dsize == 1 && content.dptr[0] <= '1') {
			gdbm_delete(db, key);
		} else {
			char	contentstr[BUFSIZ];
			size_t	len = content.dsize > 0 ? (size_t)content.dsize : 0;
			datum	updated;
			int	freq;

			/* never copy more than the local buffer can hold */
			if (len >= sizeof contentstr)
				len = sizeof contentstr - 1;
			memcpy(contentstr, content.dptr, len);
			contentstr[len] = '\0';
			freq = atoi(contentstr) - 1;
			snprintf(contentstr, sizeof contentstr, "%d", freq);
			updated.dptr = contentstr;
			updated.dsize = strlen(contentstr);
			gdbm_store(db, key, updated, GDBM_REPLACE);
		}
		free(content.dptr);	/* fetch result is malloc'ed; free(NULL) is fine */
		free(key.dptr);
		key = nextkey;
	}
	gdbm_reorganize(db);
}

/*
 * Decide whether the file `fn` is due for its periodic cleaning.
 *
 * The cutoff is the current time minus the seconds elapsed today (local
 * hour/minute/second) minus one day for each day since Saturday
 * ((tm_wday + 1) % 7).  Returns 1 when the file exists and its
 * modification time is older than that cutoff; returns 0 otherwise,
 * including when stat() or localtime_r() fails.
 */
static int timetocheck(const char* fn)
{
	struct stat	info;
	struct tm	parts;
	time_t		now;
	time_t		cutoff;

	if (stat(fn, &info) != 0)
		return 0;
	time(&now);
	if (!localtime_r(&now, &parts))
		return 0;
	cutoff = now
		- parts.tm_sec
		- (parts.tm_min * 60)
		- (parts.tm_hour * 60 * 60)
		- (((parts.tm_wday + 1) % 7) * 60 * 60 * 24);
	return info.st_mtime < cutoff ? 1 : 0;
}

/* Copy stdin to stdout unchanged, then exit with status 1.  Used when the
   phrase database cannot be used, so the input still reaches stdout. */
static void phrasespassthroughandexit(void)
{
	int	c;

	while ((c = getchar()) != EOF) {
		putchar(c);
	}
	exit(1);
}

/*
 * Open the phrase database "<fn>.db" and attach it to the tables.
 *
 * speller   - passed on to cleanphrasehashtable() when cleaning is due
 * hashtable - receives the GDBM handle and a back-pointer to the phrase entry
 * table     - phrase table whose single entry is pointed at hashtable
 * fn        - base file name; ".db" is appended
 * rflag     - non-zero opens the database read-only and skips cleaning
 *
 * The open is attempted up to 10 times, sleeping 2 seconds after each
 * failure.  If it still fails, or if the file name does not fit the local
 * buffer, an error is printed, stdin is copied to stdout, and the process
 * exits with status 1.
 */
static void readphrases(AspellSpeller* speller,
			struct wordhashtable *hashtable,
			struct phrasehashtable* table, const char* fn,
                        int rflag)
{
	char	newfn[BUFSIZ];
	int	needscleaning = 0;
	int	n;

	/* bounded replacement for strcpy/strcat into a fixed buffer */
	n = snprintf(newfn, sizeof newfn, "%s.db", fn);
	if (n < 0 || (size_t)n >= sizeof newfn) {
		fprintf(stderr, "phrases: database name too long: %s.db\n", fn);
		phrasespassthroughandexit();
	}
	if (!rflag) {
		needscleaning = timetocheck(newfn);
	}
	for (int times = 0; times < 10; times++) {
		hashtable->gdbm = gdbm_open(newfn, BUFSIZ,
					    rflag
					      ? GDBM_READER
					      : GDBM_WRCREAT, 0644,
					    gdbm_fatal);
		if (hashtable->gdbm) {
			break;
		}
		sleep(2);
	}
	if (hashtable->gdbm == NULL) {
		fprintf(stderr, "phrases: cannot open %s: %s\n", newfn,
			gdbm_strerror(gdbm_errno));
		phrasespassthroughandexit();
	}
	table->table.table = hashtable;
	hashtable->phrase = &table->table;
	if (needscleaning) {
		cleanphrasehashtable(speller, table);
	}
}

static void closephrases(struct phrasehashtable* table)
{
	gdbm_close(table->table.table->gdbm);
	table->table.table->gdbm = 0;
}

/*
 * Build a word record for `word` (GDBM build).
 *
 * A new record is allocated on every call; this function never frees it.
 * freq and hash start at 0, str and orig share one strdup'ed copy of the
 * word, phrase is taken from the table, and misspelled is set when
 * aspell_speller_check() returns 0.
 *
 * initialization and rflag are unused in this build.
 *
 * Returns the new record.  The original fell off the end of the function
 * without returning it.  Allocation failure is reported and exits with
 * status 1.
 */
static struct wordhasharray* addword(AspellSpeller* speller,
				     struct wordhashtable *table,
                                     const char* word, int initialization,
				     int rflag)
{
	struct wordhasharray*	arr;

	(void)initialization;
	(void)rflag;
	arr = malloc(sizeof *arr);
	if (arr == NULL) {
		perror("phrases: malloc");
		exit(1);
	}
	arr->freq = 0;
	arr->hash = 0;
	arr->phrase = table->phrase;
	arr->str = arr->orig = strdup(word);
	if (arr->str == NULL) {
		perror("phrases: strdup");
		exit(1);
	}
	arr->misspelled = !aspell_speller_check(speller, word, strlen(word));
	return arr;
}

/*
 * Count one occurrence of the two-word phrase array[0], array[1] in the
 * GDBM database.
 *
 * The key is "<word0>\t<word1>" and the value is the count as decimal
 * text.  If the key exists, the stored count is read into initfreq and
 * freq becomes that count plus one.  If it does not, both are set to 1
 * and table->numents is incremented.  Nothing is written to the database
 * when rflag is non-zero.
 *
 * The key and value are built with bounded copies.  A key that would
 * exceed the local buffer is truncated rather than overflowing it.
 *
 * Returns the table's single result entry, which the next call overwrites.
 */
static struct phrasehashent* addphrase(AspellSpeller* speller,
				       struct phrasehashtable* table,
                                       struct wordhasharray** array,
                                       int rflag)
{
	GDBM_FILE	db = table->table.table->gdbm;
	datum		key;
	char		keystr[BUFSIZ];
	datum		content;

	(void)speller;
	snprintf(keystr, sizeof keystr, "%s\t%s",
		 array[0]->str, array[1]->str);
	key.dptr = keystr;
	key.dsize = strlen(keystr);
	content = gdbm_fetch(db, key);
	// IMPROVEMENT
	// store phrase in memory, along with freq and increment when in
	// read-only mode; this will make it behave like the old version
	if (content.dptr) {
		char	contentstr[BUFSIZ];
		size_t	len = content.dsize > 0 ? (size_t)content.dsize : 0;

		/* stored value is not NUL-terminated; copy at most what fits */
		if (len >= sizeof contentstr)
			len = sizeof contentstr - 1;
		memcpy(contentstr, content.dptr, len);
		contentstr[len] = '\0';
		free(content.dptr);
		table->table.freq = table->table.initfreq = atoi(contentstr);
		table->table.freq++;
		if (!rflag) {
			datum	updated;

			snprintf(contentstr, sizeof contentstr, "%d",
				 table->table.freq);
			updated.dptr = contentstr;
			updated.dsize = strlen(contentstr);
			gdbm_store(db, key, updated, GDBM_REPLACE);
		}
	} else {
		if (!rflag) {
			content.dptr = "1";
			content.dsize = 1;
			gdbm_store(db, key, content, GDBM_REPLACE);
		}
		table->table.freq = table->table.initfreq = 1;
		table->numents++;
	}
	return &table->table;
}

/* Intentionally empty in the GDBM build: addphrase() stores counts in the
   database as it goes, so there is nothing to write out here. */
static void writephrases(struct phrasehashtable* table, const char* fn)
{
}
#else // GDBM
/* In-memory word table (non-GDBM build).
   NOTE(review): struct wordhasharray is defined only inside the GDBM branch
   in the code visible here, yet this struct embeds an array of it -- confirm
   this branch still compiles. */
struct wordhashtable {
	struct wordhasharray* table[HASHTABLESIZ]; /* open-addressed slots pointing into array[]; NULL = empty */
	int		numents;	/* incremented by addword(), decremented by cleanhashtable() */
	struct wordhasharray array[MAXWORDS];	/* storage for the word records */
	int		numwords;	/* next never-used index in array[]; advanced by findfreeword() */
};

/* One slot of the in-memory phrase table (non-GDBM build). */
struct phrasehashent {
	unsigned long hash;	/* hash of the two word pointers (see findphrase()) */
	int	freq;		/* current count */
	int	initfreq;	/* count loaded from the phrase file; 0 for phrases created by addphrase() */
	int	len;		/* number of words; addphrase() always sets 2 */
	struct wordhasharray** words;	/* calloc'ed array of 2 word pointers; NULL marks an empty slot */
	struct phrasehashent** prev;	/* calloc'ed array of 2 links, one per word position */
	struct phrasehashent** next;	/* calloc'ed array of 2 links, one per word position */
};

/* In-memory phrase table (non-GDBM build): open addressing with linear
   probing, see findphrase1(). */
struct phrasehashtable {
	struct phrasehashent table[HASHTABLESIZ];	/* the slots themselves */
	int		numents;	/* incremented by addphrase() for each new phrase */
};

/*
 * Case-insensitive hash of a NUL-terminated word.
 *
 * Every byte is lower-cased before it is mixed in, so "Word" and "word"
 * hash alike.  With BOB defined the add/shift/xor step is used; otherwise
 * hash = hash * 5 + c.  The empty string hashes to 0.
 *
 * Bytes are converted to unsigned char before tolower(): passing a
 * negative plain-char value to a <ctype.h> function is undefined.
 */
static unsigned long hashword(const char* word)
{
	const unsigned char*	p;
	unsigned long		hash = 0;

	for (p = (const unsigned char*)word; *p; p++) {
		const unsigned long	c = (unsigned long)tolower(*p);

#ifdef BOB	/* see above */
		hash += c;
		hash += (hash<<10);
		hash ^= (hash>>6);
#else
		hash += (hash << 2) + c;
#endif
	}
	return(hash);
}

/* Sort key used by cleanhashtable(): a score plus the word record it
   belongs to. */
struct sorthasharray {
	float freq;			/* score to sort by */
	struct wordhasharray* ent;	/* record the score was computed for */
};

/*
 * qsort() comparator for struct sorthasharray: orders by freq, largest
 * first.  (An ascending variant used to sit here under "#if 0".)
 */
static int sorthasharray(const void* a, const void* b)
{
	const struct sorthasharray*	lhs = (const struct sorthasharray*)a;
	const struct sorthasharray*	rhs = (const struct sorthasharray*)b;

	if (lhs->freq < rhs->freq)
		return 1;
	if (lhs->freq > rhs->freq)
		return -1;
	return 0;
}

/* Forward declaration; evicts entries from the word table to make room. */
static void cleanhashtable(AspellSpeller* speller,
			   struct wordhashtable *table);

/*
 * Linear-probe lookup in the word table.
 *
 * table      - table to search
 * hash       - full hash of the word; probing starts at hash % HASHTABLESIZ
 * usepointer - retained for interface compatibility; pointer equality is
 *              always accepted as a match before falling back to strcmp()
 * word       - text to find; may be NULL (hashtablelookup() passes NULL
 *              for the literal "(null)")
 * allowfail  - when non-zero, return -1 if every slot was probed without
 *              finding a match or an empty slot; when zero, the starting
 *              slot is returned in that case
 * rflag      - unused
 *
 * Returns the index of the matching slot, or of the first empty slot met
 * while probing.
 *
 * Both the stored string and `word` can be NULL, so the comparison never
 * hands a NULL pointer to strcmp(): two NULLs match, NULL never matches a
 * non-NULL string.
 */
static int hashtablelookup1(struct wordhashtable* table,
			    unsigned long hash, int usepointer,
			    const char* word, int allowfail,
                            int rflag)
{
	const int	orig_ent = (int)(hash % HASHTABLESIZ);
	int		ent = orig_ent;

	(void)usepointer;
	(void)rflag;
	while (table->table[ent]) {
		const char*	stored = table->table[ent]->str;

		if (table->table[ent]->hash == hash
		    && (stored == word
			|| (stored && word && !strcmp(stored, word)))) {
			break;
		}
		ent = (ent + 1) % HASHTABLESIZ;
		if (ent == orig_ent) {
			/* wrapped all the way round: table is full */
			if (allowfail) {
				ent = -1;
			}
			break;
		}
	}
	return(ent);
}

/*
 * Hash `word`, store the hash through `hash`, and look the word up with
 * failure allowed (hashtablelookup1() may return -1 when the table is
 * full).  The literal text "(null)" is hashed as-is but searched for as a
 * NULL pointer.
 */
static int hashtablelookup(struct wordhashtable* table,
			   unsigned long *hash, int usepointer,
			   const char* word, int rflag)
{
	const char*	key = strcmp(word, "(null)") ? word : 0;

	*hash = hashword(word);
	return hashtablelookup1(table, *hash, usepointer, key, 1, rflag);
}

/*
 * Evict one eighth of the word records to make room in the word table.
 *
 * All MAXWORDS records are ranked by their freq using sorthasharray(),
 * and the first MAXWORDS/8 records in that order are removed.  Each one
 * is unlinked from the probe table, its strings are freed, and the record
 * is zeroed except for its hash.
 *
 * speller is unused; a disabled variant of the ranking used it to weight
 * words by spelling and prefix.
 *
 * Fix: the record's str used to be freed before orig was printed and
 * freed.  addword() stores the same pointer in both when the spellings
 * agree, so that was a use-after-free followed by a double free.  The
 * message is now printed first and the shared pointer is freed once.
 *
 * NOTE(review): sorthasharray() orders largest freq first, so the records
 * evicted here are the highest-frequency ones -- confirm this is intended.
 * NOTE(review): the scratch array holds MAXWORDS elements on the stack.
 */
static void cleanhashtable(AspellSpeller* speller, struct wordhashtable *table)
{
	struct sorthasharray array[MAXWORDS];
	const unsigned int	total = (sizeof array) / (sizeof *array);
	unsigned int	i;
	unsigned int	ent;
	unsigned int	ent1;
	unsigned int	orig_ent1;

	(void)speller;
	for (i = 0; i < total; i++) {
		array[i].freq = table->array[i].freq;
		array[i].ent = table->array + i;
	}
	qsort(array, total, sizeof *array, sorthasharray);
	for (i = 0; i < total / 8; i++) {
		struct wordhasharray*	victim = array[i].ent;
		unsigned long		hash;

		ent = hashtablelookup1(table, victim->hash, 1,
				       victim->str, 0, 0);
		if (table->table[ent]) {
			unsigned int	last_ent1;

			table->table[ent] = NULL;
			/* pull back the following entries whose home slot is ent */
			for (last_ent1 = orig_ent1 = ent,
				ent1 = (ent + 1) % HASHTABLESIZ;
			     table->table[ent1]
				&& table->table[ent1]->hash % HASHTABLESIZ == ent;
			     ent1 = (ent1 + 1) % HASHTABLESIZ) {
				table->table[(ent1 + HASHTABLESIZ - 1) % HASHTABLESIZ] = table->table[ent1];
				last_ent1 = ent1;
				if (ent1 == orig_ent1)
					break;
			}
			table->table[last_ent1] = NULL;
		}
		/* report while the strings are still alive */
		fprintf(stderr, "phrases: deleting '%s' (freq=%f)\n",
			victim->orig ? victim->orig : "(null)",
			array[i].freq);
		if (victim->orig != victim->str) {
			free(victim->orig);
		}
		free(victim->str);
		/* keep the hash so the zeroed record still carries it */
		hash = victim->hash;
		memset(victim, 0, sizeof *victim);
		victim->hash = hash;
		table->numents--;
	}
}

/*
 * Return the index of an unused record (freq == 0) in table->array.
 *
 * The never-used tail of the array is tried first, via numwords.  After
 * that the whole array is scanned.  If no record is free, cleanhashtable()
 * is called to evict some and the scan is repeated.
 *
 * Fix: the scan now tests the index bound before reading array[i].  The
 * original read array[MAXWORDS] when the array was full.
 */
static int findfreeword(AspellSpeller* speller, struct wordhashtable* table)
{
	int	i;

	if (table->numwords < MAXWORDS
	    && table->array[table->numwords].freq == 0) {
		return(table->numwords++);
	}
	for (;;) {
		for (i = 0; i < MAXWORDS && table->array[i].freq != 0; i++);
		if (i < MAXWORDS) {
			return(i);
		}
		fprintf(stderr, "findfreeword: hash table full\n");
		cleanhashtable(speller, table);
	}
}

/*
 * Count one occurrence of `word` in the in-memory word table and return
 * its record.
 *
 * speller        - passed to the cleaning routines when the table is full
 * table          - word table
 * word           - spelling as seen
 * initialization - non-zero: look `word` up as given; zero: look up a
 *                  lower-cased copy
 * rflag          - forwarded to hashtablelookup()
 *
 * An existing record has its freq incremented and its orig replaced when
 * the spelling differs.  Otherwise a free record is claimed with freq 1.
 * The literal key "(null)" is stored with a NULL str.
 *
 * Fixes: the lower-cased copy is bounded by the local buffer (longer
 * words are truncated for lookup purposes), bytes go through unsigned
 * char before tolower(), and a NULL orig is no longer passed to strcmp().
 */
static struct wordhasharray* addword(AspellSpeller* speller,
				     struct wordhashtable *table,
                                     const char* word, int initialization,
				     int rflag)
{
	int		wordent;
	int		ent;
	unsigned long	hash;
	char		localcopy[BUFSIZ];
	const char*	local = localcopy;

	if (initialization) {
		local = word;
	} else {
		size_t	n;

		for (n = 0; word[n] && n < sizeof localcopy - 1; n++)
			localcopy[n] = (char)tolower((unsigned char)word[n]);
		localcopy[n] = '\0';
	}
	/* a negative result means the table is full: evict and retry */
	for (;;) {
		ent = hashtablelookup(table, &hash, 0, local, rflag);
		if (ent >= 0)
			break;
		cleanhashtable(speller, table);
	}
	if (table->table[ent]) {
		struct wordhasharray*	found = table->table[ent];

		found->freq++;
		if (found->orig == NULL || strcmp(found->orig, word)) {
			if (found->orig != found->str) {
				free(found->orig);
			}
			found->orig = strdup(word);
		}
	} else {
		wordent = findfreeword(speller, table);
		memset(&table->array[wordent], 0,
		       sizeof table->array[wordent]);
		table->array[wordent].freq = 1;
		table->array[wordent].hash = hash;
		if (!strcmp(local, "(null)")) {
			table->array[wordent].str = 0;
		} else {
			table->array[wordent].str = strdup(local);
		}
		if (word == local || !strcmp(local, word)) {
			/* same spelling: orig shares the str pointer */
			table->array[wordent].orig
				= table->array[wordent].str;
		} else {
			table->array[wordent].orig = strdup(word);
		}
		table->table[ent] = table->array + wordent;
		table->numents++;
	}
	return(table->table[ent]);
}

/* Zero the whole in-memory word table: slots, records and counters. */
static void inithashtable(struct wordhashtable *table)
{
	memset(table, 0, sizeof(struct wordhashtable));
}

/* Zero the whole in-memory phrase table: every slot and the counter. */
static void initphrasehashtable(struct phrasehashtable *table)
{
	memset(table, 0, sizeof(struct phrasehashtable));
}

static void dumphashtable(struct wordhashtable *table)
{
	int	i;

	for (i = 0; i < HASHTABLESIZ; i++) {
		if (table->table[i])
			printf("%s %d\n", table->table[i]->str,
			       table->table[i]->freq);
	}
}

static void dumpdocument(struct wordhasharray* const* array, int num)
{
	int	i;

	for (i = 0; i < num; i++) {
		printf("%s %d", array[i]->str, array[i]->freq);
		printf(" -- phrase %s %s %d\n",
		       array[i]->phrase->words[0]->str, 
		       array[i]->phrase->words[1]->str,
		       array[i]->phrase->freq);
	}
}

/*
 * Hash `len` raw bytes starting at `word`, using the same mixing step as
 * hashword().  findphrase() uses it on an array of word pointers.
 * Embedded NUL bytes are hashed like any other byte; a non-positive len
 * yields 0.
 *
 * Bytes are read as unsigned char before tolower(): passing a negative
 * plain-char value to a <ctype.h> function is undefined.
 */
static unsigned long hashaddrs(const char* word, int len)
{
	const unsigned char*	bytes = (const unsigned char*)word;
	unsigned long		hash = 0;
	int			i;

	for (i = 0; i < len; i++) {
		const unsigned long	c = (unsigned long)tolower(bytes[i]);

#ifdef BOB	/* see above */
		hash += c;
		hash += (hash<<10);
		hash ^= (hash>>6);
#else
		hash += (hash << 2) + c;
#endif
	}
	return(hash);
}

/* Sort key used by cleanphrasehashtable(): a score plus the phrase slot it
   belongs to. */
struct sortphrases {
	float freq;			/* score to sort by */
	struct phrasehashent* ent;	/* slot the score was computed for */
};

/*
 * qsort() comparator for struct sortphrases: orders by freq, smallest
 * first.  (A variant that also pushed phrases with a NULL word string to
 * the front used to sit here under "#if 0".)
 */
static int sortphrasehashents(const void* a, const void* b)
{
	const struct sortphrases*	lhs = (const struct sortphrases*)a;
	const struct sortphrases*	rhs = (const struct sortphrases*)b;

	if (lhs->freq > rhs->freq)
		return 1;
	if (lhs->freq < rhs->freq)
		return -1;
	return 0;
}

/* Forward declaration; cleanphrasehashtable() below needs it, and
   findphrase1() in turn calls cleanphrasehashtable(). */
static int findphrase1(AspellSpeller* speller,
		       struct phrasehashtable* table, unsigned long hash,
		       struct wordhasharray* const* array,
                       int canclean, int rflag);

/*
 * Weight of one word of a phrase when ranking phrases for eviction.
 * 0 for a missing word or NULL string; 0.25 when findprefix() rejects it;
 * 0.5 when it is (or turns out to be) misspelled; 1 otherwise.  The two
 * middle cases also set the word's misspelled flag.
 */
static float phrasewordfactor(AspellSpeller* speller,
			      struct wordhasharray* word)
{
	if (word == NULL || word->str == 0)
		return 0;
	if (!findprefix(word->str)) {
		word->misspelled = 1;
		return 0.25;
	}
	if (word->misspelled
	    || !aspell_speller_check(speller, word->str,
				     (int)strlen(word->str))) {
		word->misspelled = 1;
		return 0.5;
	}
	return 1;
}

/*
 * Evict low-scoring phrases from the in-memory phrase table and age the
 * rest.
 *
 * The first MAXWORDS slots are scored as freq * factor(word0) +
 * freq * factor(word1) and sorted ascending.  Slots are then evicted in
 * that order until MAXWORDS/64 have been removed or the list is
 * exhausted.  Each eviction decrements the freq of both words, clears the
 * slot, and pulls back the following slots whose home position is the
 * evicted one.  The remaining scored slots have their freq decremented
 * when it is above 1.
 *
 * Fixes:
 *  - empty slots (words == NULL) are scored 0 instead of being
 *    dereferenced;
 *  - the wrap-around guard is initialised from `ent`, as in
 *    cleanhashtable(); it used to read `ent1` before it was assigned.
 *
 * NOTE(review): only slots 0..MAXWORDS-1 of the HASHTABLESIZ-slot table
 * are considered.
 * NOTE(review): after a shift, slot `ent` receives the evicted entry's
 * hash/next/prev over the moved entry's values, and the evicted entry's
 * words array is not freed.  Both are kept as in the original; confirm
 * the intent.
 */
static void cleanphrasehashtable(AspellSpeller* speller,
				 struct phrasehashtable *table)
{
	struct sortphrases phrases[MAXWORDS];
	const unsigned int	total = (sizeof phrases) / (sizeof *phrases);
	unsigned int	i;
	unsigned int	ent;
	unsigned int	ent1;
	unsigned int	orig_ent1;
	unsigned int	numfound;

	for (i = 0; i < total; i++) {
		struct phrasehashent*	slot = table->table + i;

		if (slot->words) {
			const float	word1fact
				= phrasewordfactor(speller, slot->words[0]);
			const float	word2fact
				= phrasewordfactor(speller, slot->words[1]);

			phrases[i].freq =
			    slot->freq * word1fact +
			    slot->freq * word2fact;
		} else {
			phrases[i].freq = 0;
		}
		phrases[i].ent = slot;
	}
	qsort(phrases, total, sizeof *phrases, sortphrasehashents);
	/* bounding by i as well as numfound prevents an endless loop */
	for (numfound = i = 0; numfound < total / 64 && i < total; ) {
		if (phrases[i].ent->words) {
			ent = findphrase1(speller,
					  table, phrases[i].ent->hash,
					  phrases[i].ent->words, 0, 0);
			if (table->table[ent].words) {
				unsigned int last_ent1;
				unsigned long saved_last_ent_hash;
				struct phrasehashent** saved_last_ent_next;
				struct phrasehashent** saved_last_ent_prev;
				unsigned long saved_hash;
				struct phrasehashent** saved_next;
				struct phrasehashent** saved_prev;

				numfound++;
				saved_last_ent_hash = saved_hash
				    = table->table[ent].hash;
				saved_last_ent_next = saved_next
				    = table->table[ent].next;
				saved_last_ent_prev = saved_prev
				    = table->table[ent].prev;
				if (table->table[ent].words[0]) {
					if (table->table[ent].words[0]->freq > 0) {
						table->table[ent].words[0]->freq--;
					}
				}
				if (table->table[ent].words[1]) {
					if (table->table[ent].words[1]->freq > 0) {
						table->table[ent].words[1]->freq--;
					}
				}
				memset(&table->table[ent], 0,
				       sizeof table->table[ent]);
				last_ent1 = ent;
				for (orig_ent1 = ent,
					ent1 = (ent + 1) % HASHTABLESIZ;
				     table->table[ent1].words
					&& table->table[ent1].hash % HASHTABLESIZ == ent;
				     ent1 = (ent1 + 1) % HASHTABLESIZ) {
					table->table[(ent1 + HASHTABLESIZ - 1) % HASHTABLESIZ] = table->table[ent1];
					last_ent1 = ent1;
					saved_last_ent_hash = table->table[ent1].hash;
					saved_last_ent_next = table->table[ent1].next;
					saved_last_ent_prev = table->table[ent1].prev;
					if (ent1 == orig_ent1)
						break;
				}
				memset(&table->table[last_ent1], 0,
				       sizeof table->table[last_ent1]);
				table->table[last_ent1].hash = saved_last_ent_hash;
				table->table[last_ent1].next = saved_last_ent_next;
				table->table[last_ent1].prev = saved_last_ent_prev;
				table->table[ent].hash = saved_hash;
				table->table[ent].next = saved_next;
				table->table[ent].prev = saved_prev;
				/* re-examine the same ranked slot: an entry
				   may have been shifted into it */
				continue;
			}
		}
		i++;
	}
	for (; i < total; i++) {
		// attempt to phase-out old phrases
		if (phrases[i].ent->freq > 1)
			phrases[i].ent->freq--;
	}
}

/*
 * Linear-probe lookup of a two-word phrase in the phrase table.
 *
 * Probing starts at hash % HASHTABLESIZ and stops at the first empty slot
 * or at a slot whose hash and two word pointers match `array`.  When the
 * probe wraps back to its start:
 *   - rflag set: give up and return the start slot;
 *   - canclean set: run cleanphrasehashtable() and keep probing;
 *   - otherwise: report the full table on stderr and return the start slot.
 *
 * Returns the index of the slot the probe ended on.
 */
static int findphrase1(AspellSpeller* speller,
		       struct phrasehashtable* table, unsigned long hash,
		       struct wordhasharray* const* array,
                       int canclean, int rflag)
{
	const int	home = hash % HASHTABLESIZ;
	int		slot = home;

	for (;;) {
		const struct phrasehashent*	cand = &table->table[slot];

		if (!cand->words)
			break;
		if (cand->hash == hash
		    && memcmp(cand->words, array, (sizeof *array) * 2) == 0)
			break;
		slot = (slot + 1) % HASHTABLESIZ;
		if (slot != home)
			continue;
		if (rflag)
			break;
		if (!canclean) {
			fprintf(stderr, "phrases: findphrase1: phrase hash table full; clobbering entry\n");
			break;
		}
		cleanphrasehashtable(speller, table);
	}
	return slot;
}

/*
 * Hash the two word pointers in `array`, store the hash through `hash`,
 * and look the phrase up with cleaning allowed.  Returns the slot index
 * from findphrase1().
 */
static int findphrase(AspellSpeller* speller,
		      struct phrasehashtable* table, unsigned long* hash,
		      struct wordhasharray* const* array, int rflag)
{
	const unsigned long	key
		= hashaddrs((const char*)array, (sizeof *array) * 2);

	*hash = key;
	return findphrase1(speller, table, key, array, 1, rflag);
}

/*
 * Record the two-word phrase array[0], array[1] in the in-memory phrase
 * table and return its slot.
 *
 * With rflag set the slot located by findphrase() is returned untouched.
 * An existing phrase gets freq = initfreq + 1.  A new phrase is created
 * with freq 1 and initfreq 0, a private copy of the two word pointers,
 * and fresh prev/next link arrays.  It is then pushed onto the front of
 * the phrase chain of each word (only once when both words are the same
 * record).
 */
static struct phrasehashent* addphrase(AspellSpeller* speller,
				       struct phrasehashtable* table,
                                       struct wordhasharray** array,
                                       int rflag)
{
	unsigned long		hash;
	const int		ent = findphrase(speller, table, &hash,
						 array, rflag);
	struct phrasehashent*	slot = table->table + ent;
	int			side;

	if (rflag)
		return slot;
	if (slot->words) {
		slot->freq = slot->initfreq + 1;
		return slot;
	}
	slot->initfreq = 0;
	slot->freq = 1;
	slot->len = 2;
	slot->hash = hash;
	slot->words = (struct wordhasharray**)calloc(2, sizeof *array);
	memcpy(slot->words, array, (sizeof *array) * 2);
	table->numents++;
	slot->prev = (struct phrasehashent**)calloc(2, sizeof *slot->prev);
	slot->next = (struct phrasehashent**)calloc(2, sizeof *slot->next);
	for (side = 0; side < 2; side++) {
		/* a word paired with itself is linked through side 0 only */
		if (side == 1 && array[0] == array[1])
			break;
		slot->next[side] = array[side]->phrase;
		if (array[side]->phrase)
			array[side]->phrase->prev[side] = slot;
		array[side]->phrase = slot;
	}
	return slot;
}

/*
 * Load phrases from the text file `fn` into the in-memory tables.
 *
 * Each line is expected to hold "<freq>\t<word1>\t<word2>".  Lines that
 * do not parse into three fields are ignored.  Both words are added with
 * addword() in initialization mode, the pair is added with addphrase(),
 * and the resulting slot's freq and initfreq are set to the stored value.
 * A file that cannot be opened is reported with perror().
 */
static void readphrases(AspellSpeller* speller,
			struct wordhashtable *hashtable,
			struct phrasehashtable* table, const char* fn,
                        int rflag)
{
	char	line[BUFSIZ];
	char	first[BUFSIZ];
	char	second[BUFSIZ];
	struct wordhasharray* pair[2];
	struct phrasehashent* slot;
	int	count;
	FILE*	in = fopen(fn, "r");

	if (!in) {
		perror(fn);
		return;
	}
	while (fgets(line, sizeof line, in)) {
		if (sscanf(line, "%d\t%s\t%s\n", &count, first, second) != 3)
			continue;
		pair[0] = addword(speller, hashtable, first, 1, rflag);
		pair[1] = addword(speller, hashtable, second, 1, rflag);
		slot = addphrase(speller, table, pair, rflag);
		slot->freq = slot->initfreq = count;
	}
	fclose(in);
}

static void writephrases(struct phrasehashtable* table, const char* fn)
{
	FILE*	fp;
	int	i;

	if ((fp = fopen(fn, "w"))) {
		for (i = 0; i < HASHTABLESIZ; i++) {
			if (table->table[i].freq > 0
                            && table->table[i].words[0]->str != 0
                            && table->table[i].words[1]->str != 0) {
				fprintf(fp, "%d\t%s\t%s\n",
					table->table[i].freq,
					table->table[i].words[0]->str,
					table->table[i].words[1]->str);
			}
		}
		fclose(fp);
	} else {
		perror(fn);
	}
}

/* Intentionally empty in the non-GDBM build: there is no database handle
   to close. */
static void closephrases(struct phrasehashtable* table)
{
}
#endif // GDBM

/*
 * Return 1 when `c` counts as skippable filler: any isspace() character,
 * '>' or '_'.  Return 0 otherwise.
 *
 * The character goes through unsigned char before isspace(): passing a
 * negative plain-char value to a <ctype.h> function is undefined.
 */
static int isskippedspace(const char c)
{
	return isspace((unsigned char)c) || c == '>' || c == '_'; // || c == '<';
}

/*
 * If `str` consists only of skippable characters (see isskippedspace()),
 * collapse every run of them into a single '_' in place.  A string that
 * contains any other character is left unchanged.
 *
 * Fix: the in-place left shift uses memmove().  The original used
 * strcpy(str, str + 1), whose source and destination overlap, which is
 * undefined.
 */
static void replacespaces(char* str)
{
	char* p;

	for (p = str; *p && isskippedspace(*p); p++);
	if (*p) {
		return;		/* found a real character: nothing to do */
	}
	while (*str) {
		/* drop the current character while the next is filler too */
		while (isskippedspace(*str) && isskippedspace(str[1]))
			memmove(str, str + 1, strlen(str));
		if (isskippedspace(*str))
			*str = '_';
		str++;
	}
}

/*
 * Overwrite every "=20" sequence in `p` with three spaces, in place.
 * Returns the number of characters replaced (three per sequence).
 */
int replaceescapes(char* p)
{
	int	replaced = 0;
	char*	hit;

	for (hit = strstr(p, "=20"); hit; hit = strstr(hit + 3, "=20")) {
		memset(hit, ' ', 3);
		replaced += 3;
	}
	return replaced;
}

/*
 * Return 1 when `p` holds nothing but filler, 0 otherwise.
 *
 * pure       - non-zero: only isspace() characters count as filler;
 *              zero: isskippedspace() decides (so '>' and '_' count too)
 * pastheader - zero: any newline makes the result 0; non-zero: a newline
 *              is accepted unless it is directly followed by another one
 *
 * The empty string yields 1.
 *
 * Characters go through unsigned char before isspace(): passing a
 * negative plain-char value to a <ctype.h> function is undefined.
 */
int onlyspaces(const char* p, int pastheader, int pure)
{
	char	lastch = ' ';

	for (; *p; p++) {
		if (pure) {
			if (!isspace((unsigned char)*p))
				return 0;
		} else if (!isskippedspace(*p)) {
			return 0;
		}
		if (lastch == '\n') {
			/* something follows a newline */
			if (*p == '\n')
				return 0;
			if (!pastheader)
				return 0;
		}
		lastch = *p;
	}
	if (lastch == '\n' && !pastheader)
		return 0;
	return 1;
}

/*
 * Compact a run of digits and separators in `betweenwords` into one token.
 *
 * betweenwords - buffer holding *betweencnt characters; it must have room
 *                for a terminating NUL (plus two more characters when
 *                LEAVEDIGITS is defined)
 * startoff     - index to start looking for the first digit; a negative
 *                value makes the call a no-op
 * betweencnt   - in: number of characters in the buffer; out: length of
 *                the rewritten string
 *
 * Every non-digit between the first digit at or after startoff and the
 * last digit in the buffer is deleted.  Without LEAVEDIGITS the first and
 * last digit are then folded to letters ('0' -> 'a' ... '9' -> 'j').
 * With LEAVEDIGITS an 'x' is inserted before the first digit (unless one
 * already precedes it) and another after the last digit.
 *
 * Fixes: the digit scans are bounded by *betweencnt, and the buffer is
 * left untouched when it holds no digit; the write-only numdigits
 * counter, which was declared only without LEAVEDIGITS yet used
 * unconditionally, is gone; characters go through unsigned char before
 * isdigit().
 */
static void grabphonenumber(char* betweenwords, int startoff, int* betweencnt)
{
	int	j;
	int	k;
	int	l;

	if (startoff < 0) {
		return;
	}
	// find first digit
	for (j = startoff;
	     j < *betweencnt && !isdigit((unsigned char)betweenwords[j]); j++);
	if (j >= *betweencnt) {
		return;		// no digit in range
	}
	// find last digit (j is a digit, so this stops at j at the latest)
	for (l = *betweencnt - 1;
	     l > j && !isdigit((unsigned char)betweenwords[l]); l--);
	betweenwords[*betweencnt] = '\0';
	// delete all non-digits
	for (k = j; k <= l; ) {
		if (!isdigit((unsigned char)betweenwords[k])) {
			char* p = betweenwords + k;

			// shift the tail (including its NUL) left by one
			memmove(p, p + 1, strlen(p));
			l--;
			continue;
		}
#ifndef LEAVEDIGITS
		if (k == j || k == l) {
			// fold digit for ifile
			betweenwords[k] = (betweenwords[k] - '0') + 'a';
		}
#endif
		k++;
	}
	// reset *betweencnt
	*betweencnt = strlen(betweenwords);
#ifdef LEAVEDIGITS
	if (j == startoff || betweenwords[j - 1] != 'x') {
		// insert an 'x' just before first digit
		for (k = *betweencnt; k > j; k--) {
			betweenwords[k] = betweenwords[k - 1];
		}
		(*betweencnt)++;
		betweenwords[j] = 'x';
	}
	// find last digit again
	for (l = *betweencnt - 1;
	     l > 0 && !isdigit((unsigned char)betweenwords[l]); l--);
	// insert an 'x' just after last digit
	for (k = *betweencnt; k > l; k--) {
		betweenwords[k] = betweenwords[k - 1];
	}
	(*betweencnt)++;
	betweenwords[l + 1] = 'x';
#endif
	betweenwords[*betweencnt] = '\0';
}

/*
 * Filter standard input to standard output one character at a time.
 *
 * Usage: [-r] [-n] [phrasefile]
 *   -r          counted in rflag, which is handed to addword()/addphrase();
 *               it also suppresses writing the phrase file back at the end
 *   -n          treat the whole input as body text (pastheader starts at 1)
 *   phrasefile  loaded with readphrases() before filtering and rewritten
 *               with writephrases() when the word or phrase count changed
 *
 * The input is split into runs of letters ("words", collected in lastword)
 * and everything else (collected in betweenwords).  Words of 3..12 letters
 * are registered with addword(); when only skippable white space separated
 * two such words the pair is registered with addphrase() and, for phrases
 * whose initfreq exceeds 4, the separator is passed to replacespaces().
 * Shorter and longer words are simply appended to betweenwords.  Digit
 * runs that satisfy the runofdigits/contigdigits bookkeeping are rewritten
 * by grabphonenumber().  Output always lags one word behind: the previous
 * word (words[0]) is printed followed by the text that came after it.
 *
 * Returns 0 on success, EXIT_FAILURE when the aspell speller cannot be
 * created.
 */
int main(int argc, char* argv[])
{
	int		c;
	int		lastch;
	int		charcnt;	/* letters seen in the current word */
	int		betweencnt;	/* characters held in betweenwords */
	struct wordhasharray* words[2] = {0, 0};  /* previous, current word */
	char		lastword[14];
	char		betweenwords[BUFSIZ * 2];
	static struct wordhashtable hashtable;
	static struct phrasehashtable phrasehashtable;
	int		i;		/* number of 3..12 letter words so far */
	int		column;
	int		lastcolumn;
	int		ac;
	char**		av;
	int		rflag;
	int		nflag;
	int		pastheader;
	int		prevwasonlyspaces;
	int		ateof;
	AspellSpeller	*speller;
	AspellConfig	*speller_config;
	int		runofdigits;
	int		contigdigits;
	int		firstdigit;	/* offset of a candidate number, or -1 */
	char		firstnum;
	int		initnumwords;
	int		initnumphrases;
	int		spacessincenewline;

	ac = argc;
	av = argv;
	inithashtable(&hashtable);
	initphrasehashtable(&phrasehashtable);
	nflag = rflag = 0;
	while (ac > 1) {
		if (!strcmp(av[1], "-r")) {
			ac--;
			av++;
			rflag++;
		} else if (!strcmp(av[1], "-n")) {
			ac--;
			av++;
			nflag++;
		} else
			break;
	}
	speller_config = new_aspell_config();
	aspell_config_replace(speller_config, "lang", "en_US");
	aspell_config_replace(speller_config, "ignore-case", "true");
	AspellCanHaveError* possible_err = new_aspell_speller(speller_config);
	if (aspell_error_number(possible_err) != 0) {
		fputs("phrases: ", stderr);
		fputs(aspell_error_message(possible_err), stderr);
		fputc('\n', stderr);
		/* an error object must not be converted into a speller;
		   give up instead of carrying an invalid handle around */
		closephrases(&phrasehashtable);
		return EXIT_FAILURE;
	}
	speller = to_aspell_speller(possible_err);
	initnumwords = 0;
	initnumphrases = 0;
	if (ac > 1) {
		readphrases(speller,
			    &hashtable, &phrasehashtable, av[1], 0);
		initnumwords = hashtable.numents;
		initnumphrases = phrasehashtable.numents;
	}
	firstnum = firstdigit = runofdigits = betweencnt = charcnt = 0;
	contigdigits = 0;
	spacessincenewline = 0;
	lastword[0] = betweenwords[0] = '\0';
	ateof = prevwasonlyspaces = pastheader = column = lastcolumn = i = 0;
	lastch = ' ';
	if (nflag)
		pastheader = 1;     // if it's not a message, don't differentiate between header and body
	while (!ateof) {
		c = getchar();
		if (c == EOF) {
			/* pretend the input ends with '.' so the last word
			   goes through the non-letter branch once more */
			ateof = 1;
			c = '.';
		}
		if (isalpha(c)) {
			if ((runofdigits == 0 || runofdigits > 2)
			    && firstdigit != -1 && contigdigits) {
				grabphonenumber(betweenwords, firstdigit,
						&betweencnt);
			}
			contigdigits = firstnum = runofdigits = 0;
			firstdigit = -1;
			column++;
			if (charcnt < 13) {
				lastword[charcnt] = c;
			} else {
				if (charcnt == 13) {
					/* the word is too long to be looked
					   up: move what was collected so far
					   into betweenwords */
					size_t	len;

					lastword[13] = '\0';
					len = strlen(lastword);
					/* keep room for the word, an optional
					   space, the current letter and the
					   terminating NUL */
					if ((size_t) betweencnt + len + 2
					    >= sizeof betweenwords) {
						if ((size_t) betweencnt
						    >= sizeof betweenwords)
							betweencnt = (sizeof betweenwords) - 1;
						betweenwords[betweencnt] = '\0';
						// NOTE(review): this is written
						// ahead of the still pending
						// words[0]
						printf("%s", betweenwords);
						betweencnt = 0;
					}
					betweenwords[betweencnt] = '\0';
					// printf("appending %s\n", lastword);
					strcat(betweenwords, lastword);
					betweencnt = strlen(betweenwords);
					lastword[0] = '\0';
				}
				if (column >= 1024 && spacessincenewline == 0) {
					/* break up very long lines that
					   contain no space at all */
					betweenwords[betweencnt] = ' ';
					betweencnt++;
					column++;
					spacessincenewline++;
				}
				// NOTE(review): no bounds check here; a single
				// run of letters longer than the buffer would
				// overflow betweenwords
				betweenwords[betweencnt] = c;
				betweencnt++;
			}
			charcnt++;
		} else {
			int wasonlyspaces = isskippedspace(c);
			int putspaceafter = 0;

			/* a newline that follows '=' does not reset the
			   column unless betweenwords is empty */
			if (c == '\n' && (betweencnt == 0 || lastch != '=')) {
				column = 0;
				spacessincenewline = 0;
			} else {
				column++;
			}
			betweenwords[betweencnt] = '\0';
			// NOTE(review): the comparison is done in unsigned
			// arithmetic, so it is true for betweencnt == 0 (an
			// empty string is printed) and otherwise only once
			// betweencnt already exceeds the buffer size
			if (betweencnt - 1 >= sizeof betweenwords) {
				printf("%s", betweenwords);
				betweencnt = 0;
			}
			if (charcnt > 2 && charcnt < 13) {
				spacessincenewline
					+= replaceescapes(betweenwords);
				wasonlyspaces = onlyspaces(betweenwords,
							   pastheader, 0);
				/* charcnt is 3..12 here */
				lastword[charcnt] = '\0';
				words[1] = addword(speller,
						   &hashtable, lastword, 0,
						   rflag);
				if (wasonlyspaces && i > 0) {
					//printf("lastcolumn = %d\n",
					//       lastcolumn);
					struct phrasehashent* phrase;

					phrase = addphrase(speller,
							   &phrasehashtable,
							   words, rflag);
					//if ((lastcolumn != 5 ||
					//	strcmp(words[0]->orig,
					//		  "From") ||
					//	strcmp(betweenwords, " "))
					//    ) {
					// "3.24..." because it is an
					//	estimate of alpha
					// http://en.wikipedia.org/wiki/Pareto_distribution#Parameter_estimation
					// "4" because that is in the 1% quantile
					// see http://www.math.uah.edu/stat/special/Pareto.html
					// perl -e 'print
					//	1.0/((1-.99)**(1.0/3.2441218159))'
					if (phrase->initfreq > 4
					    && (lastcolumn != 5 ||
						strcmp(words[0]->orig,
						       "From"))
					    && c != '@' && c != '_'
					    && c != ':'
					    ) {
						replacespaces(betweenwords);
					}
				}
				if (i > 0) {
					/* emit the previous word now that
					   its separator is final */
					if (words[0] && words[0]->orig) {
						printf("%s", words[0]->orig);
					}
#ifdef COMPLEX
					if (pastheader && prevwasonlyspaces &&
					    (betweenwords[0] == '.' ||
					     betweenwords[0] == '!' ||
					     betweenwords[0] == '?')
					    && c != '-') {
						putchar(' ');
						spacessincenewline++;
					}
#endif
				}
				words[0] = words[1];
				i++;
				lastcolumn = column;
				printf("%s", betweenwords);
				firstdigit = betweencnt = 0;
				if (column > 1024 && spacessincenewline == 0) {
					betweenwords[betweencnt] = ' ';
					betweencnt++;
					column++;
					spacessincenewline++;
				} else if ((!strcmp(lastword, "http") || !strcmp(lastword, "https"))  && c == ':') {
					/* separate the URL scheme from the
					   ':' that follows it */
					betweenwords[betweencnt] = ' ';
					betweencnt++;
					column++;
					spacessincenewline++;
				}
			} else {
				/* word too short or too long to look up:
				   it becomes part of the separator text.
				   The separator is cut off so that the 13
				   characters of lastword always fit. */
				lastword[charcnt < 13 ? charcnt : 13] = '\0';
				betweenwords[(sizeof betweenwords) - 1 - 13] = '\0';
				strcat(betweenwords, lastword);
				betweencnt = strlen(betweenwords);
			}
			/* phone number bookkeeping, body text only */
			if (isdigit(c) && pastheader) {
				if (runofdigits == 0) {
					if (isspace(lastch)
					    || lastch == '('
					    || lastch == '-'
					    || lastch == '.') {
						runofdigits++;
					}
					if (!contigdigits) {
						firstnum = c;
						firstdigit = betweencnt;
					}
				} else {
					if (runofdigits > 1) {
						contigdigits = 1;
					}
					runofdigits++;
				}
			} else if (runofdigits > 0) {
				if ((runofdigits == 1
				     && firstnum != '1')
				    || runofdigits < 3) {
					contigdigits = 0;
					firstdigit = -1;
				}
				runofdigits = 0;
			}
			//lastword[charcnt < 13 ? charcnt: 13] = '\0';
			//printf("lastword = %s\n", lastword);
			//printf("pastheader = %d\n", pastheader);
			//
			if (betweencnt > 0 && c == '\n' && lastch == '=') {
				if (column < 1024 || spacessincenewline > 0) {
					/* "=\n": overwrite the '=' with a
					   NUL; later strcat()/strlen() calls
					   continue from that position */
					betweenwords[betweencnt - 1] = '\0';
					column--;
				} else {
					betweenwords[betweencnt] = c;
					betweencnt++;
					column = 0;
					spacessincenewline = 0;
				}
			} else if (c == ')' || c == '(' || c == '?'
				   || c == '[' || c == ']'
#ifdef COMPLEX
				   || c == '"'
#endif
				   ) {
#ifdef COMPLEX
				if ((c == '"')
				    && !isalpha(lastch)) {
					betweenwords[betweencnt - 1] = ' ';
					spacessincenewline++;
				}
#endif
				/* space in front of the bracket */
				betweenwords[betweencnt] = ' ';
				betweencnt++;
				column++;
				spacessincenewline++;
			} else if ((c == '.' || c == '!' || c == '?')
				   && prevwasonlyspaces
				   && onlyspaces(betweenwords, 1, 0)) {
				betweenwords[betweencnt] = ' ';
				betweencnt++;
				column++;
				spacessincenewline++;
				putspaceafter++;
			}
			/* store the character itself, possibly replaced */
			if (pastheader && !spacessincenewline &&
			    (isspace(c) && (lastch == ':' || lastch == ';'))) {
				c = lastch;
			} else if (!pastheader
				   || !(c == '\n' && lastch == '=')) {
				if (pastheader
				    && (c == '<' || c == ','
					|| (c == '>'
					    && lastch != ' ')
					|| (c == '/'
					    && (lastch == '<'
						|| lastch == ' ')))) {
					betweenwords[betweencnt] = ' ';
					spacessincenewline++;
				} else if (c == '"') {
					betweenwords[betweencnt] = '\'';
				} else {
					betweenwords[betweencnt] = c;
				}
				betweencnt++;
			}
			/* space behind the character where wanted */
			if (c == '(' || c == ')' || c == '[' || c == ']'
#ifdef COMPLEX
			    || c == '"'
#endif
			    ) {
				betweenwords[betweencnt] = ' ';
				betweencnt++;
				column++;
				spacessincenewline++;
#ifdef COMPLEX
			} else if (c == '\'' && !isalpha(lastch)) {
				betweenwords[betweencnt] = ' ';
				betweencnt++;
				column++;
				spacessincenewline++;
#endif
			} else if (putspaceafter) {
				betweenwords[betweencnt] = ' ';
				betweencnt++;
				column++;
				spacessincenewline++;
			}
			lastword[0] = '\0';
			charcnt = 0;
			/* an empty line ends the header */
			if (lastch == '\n' && c == '\n')
				pastheader = 1;
			if (c != '\n' && isskippedspace(c)) {
				spacessincenewline++;
			}
			prevwasonlyspaces = wasonlyspaces;
		}
		lastch = c;
	}
	/* flush whatever is still pending: word, then separator text */
	printf("%s", lastword);
	if (words[0] && words[0]->orig) {
		printf("%s", words[0]->orig);
	}
	if ((runofdigits == 0 || runofdigits > 2) && firstdigit != -1
	    && contigdigits) {
		grabphonenumber(betweenwords, firstdigit, &betweencnt);
	}
	betweenwords[betweencnt] = '\0';
	printf("%s", betweenwords);
	//dumphashtable(&hashtable);
	// dumpdocument(document, i);
	putchar('\n');
	fflush(stderr);
	fflush(stdout);
	/* only rewrite the phrase file when something was added */
	if (ac > 1 && !rflag
	    && (hashtable.numents != initnumwords
		|| phrasehashtable.numents != initnumphrases)) {
		writephrases(&phrasehashtable, av[1]);
	}
	closephrases(&phrasehashtable);
	return 0;
}