/* Copyright (C) 2003 Robert, dummy@csoft.net */ /* This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include #include #include #include #include extern "C" { #include }; extern char threechars[][4]; int _IO_stderr_; /* don't under why this is necessary */ #define BOB 1 /* http://burtleburtle.net/bob/hash/hashfaq.html#unique Hash Function FAQ */ static float student(float percent, int num) { int which; double p; double q; double t; double df; int status; double bound; which = 2; p = percent / 100; q = 1 - p; df = num - 1; bound = 0; cdft(&which, &p, &q, &t, &df, &status, &bound); return t; } static char binhex[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="; static int matchthreechar(char c, int charcnt, int lastmatch) { int i; int orig_i; //printf("c='%c',charcnt=%d\n", c, charcnt); for (orig_i = i = lastmatch?lastmatch:1; threechars[i][0]; i++) { if (threechars[i][charcnt] == c) { //printf("return(%d) (\"%s\")\n", i, threechars[i]); return(i); } if (threechars[i][charcnt] > c) { //printf("return(0)\n"); return(0); } if (charcnt > 0 && strncmp(threechars[orig_i], threechars[i], charcnt)) { //printf("return(0)\n"); return(0); } } return(0); } int main(int argc, char* argv[]) { int c; unsigned long hash; int charcnt; int i; unsigned long buckets[128]; unsigned long uberbuckets[128]; unsigned long total; unsigned long ubertotal; int num; int ubernum; unsigned int bits; long lower; long upper; long lowermean; long uppermean; long uberlower; long uberupper; long uberlowermean; long uberuppermean; int verbose; float percent; #define DEBUGWORDS 1 #ifdef DEBUGWORDS char lastword[14]; #endif int lastmatched; int outputhex; int ac; int prevchar; int skipdigitstuff; int minwordlen; int combine; verbose = 0; outputhex = 1; percent = 60; minwordlen = 3; for (ac = 1; ac < argc; ac++) { if (argv[ac][0] == '.' || isdigit(argv[ac][0])) { percent = atof(argv[ac]); } else if (!strcmp(argv[ac], "-v")) { verbose = 1; } else if (!strcmp(argv[ac], "-w")) { minwordlen = atoi(argv[ac + 1]); ac++; } else if (!strcmp(argv[ac], "-b")) { outputhex = 0; } } hash = 0; charcnt = 0; for (i = 0; i < 128; i++) { uberbuckets[i] = buckets[i] = 0; } lastmatched = 1; prevchar = '\0'; skipdigitstuff = 0; ubertotal = total = 0; while ((c = getchar()) != EOF) { if (skipdigitstuff) { if (isalnum(c)) continue; skipdigitstuff = 0; } if (isalpha(c)) { #ifdef BOB /* see above */ hash += tolower(c); hash += (hash<<10); hash ^= (hash>>6); #else hash += (hash << 2) + tolower(c); #endif if (lastmatched && charcnt < 3) { lastmatched = matchthreechar(tolower(c), charcnt, lastmatched); } #ifdef DEBUGWORDS if (charcnt < 13) lastword[charcnt] = c; #endif charcnt++; } else { if (lastmatched && !isdigit(c)) { uberbuckets[hash % 128]++; ubertotal++; if (charcnt > minwordlen && charcnt < 13) { buckets[hash % 128]++; total++; #ifdef DEBUGWORDS if (charcnt < 13) lastword[charcnt] = '\0'; else lastword[13] = '\0'; if (verbose) { printf("lastword = %s\n", lastword); } #endif } } charcnt = 0; hash = 0; lastmatched = 1; } if (isdigit(c)) skipdigitstuff = 1; prevchar = c; } //if (total > 127 /*|| percent < 61*/) { combine = 1; //} else if (total > 63) { // combine = 2; //} else if (total > 31) { // combine = 4; //} else if (total > 15) { // combine = 8; //} else if (total > 7) { // combine = 16; //} else if (total > 3) { // combine = 32; //} else { // combine = 64; //} for (ubernum = num = i = 0; i < 128; i += combine) { int j; int subtotal; int ubersubtotal; for (ubersubtotal = subtotal = j = 0; j < combine; j++) { subtotal += buckets[i + j]; ubersubtotal += uberbuckets[i + j]; } if (subtotal != 0) { num++; // "total" is now calculated above // total += buckets[i]; } if (ubersubtotal != 0) { ubernum++; // "total" is now calculated above // total += buckets[i]; } } if (num > 2) { double stddev; double pstddev; long mean; double uberstddev; double uberpstddev; long ubermean; float confid; mean = total / 128; ubermean = ubertotal / 128; if (verbose) { printf("ubertotal = %lu, total = %lu\n", ubertotal, total); } // assume Poisson distribution pstddev = sqrt((double)total / 128); uberpstddev = sqrt((double)ubertotal / 128); for (ubertotal = total = i = 0; i < 128; i += combine) { int j; int subtotal; int ubersubtotal; for (ubersubtotal = subtotal = j = 0; j < combine; j++) { subtotal += buckets[i + j]; ubersubtotal += uberbuckets[i + j]; } //if (subtotal != 0) { total += (subtotal - mean) * (subtotal - mean); ubertotal += (ubersubtotal - ubermean) * (ubersubtotal - ubermean); //} } stddev = sqrt((double)total / 128); uberstddev = sqrt((double)ubertotal / 128); confid = student(percent, 128); lowermean = mean - (long)(confid * stddev / sqrt(128 - 1) + 0.5); uppermean = mean + (long)(confid * stddev / sqrt(128 - 1) + 0.5); uberlowermean = ubermean - (long)(confid * uberstddev / sqrt(128 - 1) + 0.5); uberuppermean = ubermean + (long)(confid * uberstddev / sqrt(128 - 1) + 0.5); //lower = mean - (long)(confid * stddev + 0.5); //upper = mean + (long)(confid * stddev + 0.5); // Poisson distribution stuff lower = mean - (long)(confid * pstddev + 0.5); upper = mean + (long)(confid * pstddev + 0.5); uberlower = mean - (long)(confid * uberpstddev + 0.5); uberupper = mean + (long)(confid * uberpstddev + 0.5); if (verbose) { printf("confid = %g, mean = %ld, poisson stddev = %g, stddev = %g\n", confid, mean, pstddev, stddev); } } else { uberlowermean = uberuppermean = uberlower = uberupper = upper = lower = lowermean = uppermean = 0; } if (verbose) { // printf("between %d and %d\n", lower, upper); //printf("greater than %d (mean = %d, stddev = %d, num = %d, ubertotal = %d, total = %d)\n", // lowermean, mean, stddev, confid, 128, ubertotal, total); printf("greater than %ld or lower than %ld (num = %d)\n", upper, lower, 128); printf("lowermean = %ld, uppermean = %ld\n", lowermean, uppermean); printf("ubertotal = %ld, total = %ld\n", ubertotal, total); printf("ubernum = %d, num = %d\n", ubernum, num); printf("combine = %d\n", combine); for (i = 0; i < 128; i += combine) { int j; int subtotal; for (subtotal = j = 0; j < combine; j++) { subtotal += buckets[i + j]; } for (j = 0; j < combine; j++) { printf(" %d", subtotal); } } putchar('\n'); for (i = 0; i < 128; i += combine) { int j; int ubersubtotal; for (ubersubtotal = j = 0; j < combine; j++) { ubersubtotal += uberbuckets[i + j]; } for (j = 0; j < combine; j++) { printf(" %d", ubersubtotal); } } putchar('\n'); } for (bits = i = 0; i < 128; i += combine) { int j; int subtotal; int ubersubtotal; for (ubersubtotal = subtotal = j = 0; j < combine; j++) { subtotal += buckets[i + j]; ubersubtotal += uberbuckets[i + j]; } for (j = 0; j < combine; j++) { if (num < 2) { if ((long)ubersubtotal < uberlower || (long)ubersubtotal > uberupper) { bits |= 1 << ((outputhex ? 4 : 6) - ((i + j) % (outputhex ? 4 : 6)) - 1); } } else { if ((long)subtotal < lower || (long)subtotal > upper) { bits |= 1 << ((outputhex ? 4 : 6) - ((i + j) % (outputhex ? 4 : 6)) - 1); } } if (subtotal > 0) { //if ((long)subtotal <= lower || (long)subtotal > upper) { // bits |= 1 << ((outputhex ? 4 : 6) - ((i + j) % (outputhex ? 4 : 6)) - 1); //} //if ((long)subtotal >= lowermean) { // bits |= 1 << ((i + j) % (outputhex ? 4 : 6)); //} } //if ((long)subtotal > upper) { // bits |= 1 << ((outputhex ? 4 : 6) - ((i + j) % (outputhex ? 4 : 6)) - 1); //} if ((i + j) % (outputhex ? 4 : 6) == (outputhex ? 4 : 6) - 1) { if (outputhex) printf("%x", bits); else printf("%c", binhex[bits]); bits = 0; } } } if (!outputhex) printf("%c", binhex[bits]); putchar('\n'); fflush(stderr); fflush(stdout); return 0; }