|
|
1.1 ! root 1: #include "tdef.h" ! 2: #include "fns.h" ! 3: #include "ext.h" ! 4: ! 5: #define HY_BIT 0200 /* stuff in here only works for 7-bit ascii */ ! 6: /* this value is used (as a literal) in suftab.c */ ! 7: /* to encode possible hyphenation points in suffixes. */ ! 8: /* it could be changed, by widening the tables */ ! 9: /* to be shorts instead of chars. */ ! 10: ! 11: /* ! 12: * troff8.c ! 13: * ! 14: * hyphenation ! 15: */ ! 16: ! 17: char hbuf[NHEX]; ! 18: char *nexth = hbuf; ! 19: Tchar *hyend; ! 20: ! 21: #define THRESH 160 /* digram goodness threshold */ ! 22: int thresh = THRESH; ! 23: ! 24: int texhyphen(void); ! 25: static int alpha(Tchar); ! 26: ! 27: void hyphen(Tchar *wp) ! 28: { ! 29: int j; ! 30: Tchar *i; ! 31: ! 32: i = wp; ! 33: while (punct((*i++))) ! 34: ; ! 35: if (!alpha(*--i)) ! 36: return; ! 37: wdstart = i++; ! 38: while (alpha(*i++)) ! 39: ; ! 40: hyend = wdend = --i - 1; ! 41: while (punct((*i++))) ! 42: ; ! 43: if (*--i) ! 44: return; ! 45: if (wdend - wdstart < 4) /* 4 chars is too short to hyphenate */ ! 46: return; ! 47: hyp = hyptr; ! 48: *hyp = 0; ! 49: hyoff = 2; ! 50: ! 51: /* for now, try exceptions first, then tex (if hyphalg is non-zero), ! 52: then suffix and digram if tex didn't hyphenate it at all. ! 53: */ ! 54: ! 55: if (!exword() && !texhyphen() && !suffix()) ! 56: digram(); ! 57: ! 58: /* this appears to sort hyphenation points into increasing order */ ! 59: *hyp++ = 0; ! 60: if (*hyptr) ! 61: for (j = 1; j; ) { ! 62: j = 0; ! 63: for (hyp = hyptr + 1; *hyp != 0; hyp++) { ! 64: if (*(hyp - 1) > *hyp) { ! 65: j++; ! 66: i = *hyp; ! 67: *hyp = *(hyp - 1); ! 68: *(hyp - 1) = i; ! 69: } ! 70: } ! 71: } ! 72: } ! 73: ! 74: static alpha(Tchar i) /* non-zero if really alphabetic */ ! 75: { ! 76: if (ismot(i)) ! 77: return 0; ! 78: else if (cbits(i) >= ALPHABET) /* this isn't very elegant, but there's */ ! 79: return 0; /* no good way to make sure i is in range for */ ! 80: else /* the call of isalpha */ ! 81: return isalpha(cbits(i)); ! 82: } ! 83: ! 84: ! 85: punct(Tchar i) ! 86: { ! 87: if (!i || alpha(i)) ! 88: return(0); ! 89: else ! 90: return(1); ! 91: } ! 92: ! 93: ! 94: void caseha(void) /* set hyphenation algorithm */ ! 95: { ! 96: hyphalg = HYPHALG; ! 97: if (skip()) ! 98: return; ! 99: noscale++; ! 100: hyphalg = atoi0(); ! 101: noscale = 0; ! 102: } ! 103: ! 104: ! 105: void caseht(void) /* set hyphenation threshold; not in manual! */ ! 106: { ! 107: thresh = THRESH; ! 108: if (skip()) ! 109: return; ! 110: noscale++; ! 111: thresh = atoi0(); ! 112: noscale = 0; ! 113: } ! 114: ! 115: ! 116: void casehw(void) ! 117: { ! 118: int i, k; ! 119: char *j; ! 120: Tchar t; ! 121: ! 122: k = 0; ! 123: while (!skip()) { ! 124: if ((j = nexth) >= hbuf + NHEX - 2) ! 125: goto full; ! 126: for (; ; ) { ! 127: if (ismot(t = getch())) ! 128: continue; ! 129: i = cbits(t); ! 130: if (i == ' ' || i == '\n') { ! 131: *j++ = 0; ! 132: nexth = j; ! 133: *j = 0; ! 134: if (i == ' ') ! 135: break; ! 136: else ! 137: return; ! 138: } ! 139: if (i == '-') { ! 140: k = HY_BIT; ! 141: continue; ! 142: } ! 143: *j++ = maplow(i) | k; ! 144: k = 0; ! 145: if (j >= hbuf + NHEX - 2) ! 146: goto full; ! 147: } ! 148: } ! 149: return; ! 150: full: ! 151: ERROR "exception word list full." WARN; ! 152: *nexth = 0; ! 153: } ! 154: ! 155: ! 156: int exword(void) ! 157: { ! 158: Tchar *w; ! 159: char *e, *save; ! 160: ! 161: e = hbuf; ! 162: while (1) { ! 163: save = e; ! 164: if (*e == 0) ! 165: return(0); ! 166: w = wdstart; ! 167: while (*e && w <= hyend && (*e & 0177) == maplow(cbits(*w))) { ! 168: e++; ! 169: w++; ! 170: } ! 171: if (!*e) { ! 172: if (w-1 == hyend || (w == wdend && maplow(cbits(*w)) == 's')) { ! 173: w = wdstart; ! 174: for (e = save; *e; e++) { ! 175: if (*e & HY_BIT) ! 176: *hyp++ = w; ! 177: if (hyp > hyptr + NHYP - 1) ! 178: hyp = hyptr + NHYP - 1; ! 179: w++; ! 180: } ! 181: return(1); ! 182: } else { ! 183: e++; ! 184: continue; ! 185: } ! 186: } else ! 187: while (*e++) ! 188: ; ! 189: } ! 190: } ! 191: ! 192: ! 193: suffix(void) ! 194: { ! 195: Tchar *w; ! 196: char *s, *s0; ! 197: Tchar i; ! 198: extern char *suftab[]; ! 199: ! 200: again: ! 201: i = cbits(*hyend); ! 202: if (!alpha(i)) ! 203: return(0); ! 204: if (i < 'a') ! 205: i -= 'A' - 'a'; ! 206: if ((s0 = suftab[i-'a']) == 0) ! 207: return(0); ! 208: for (;;) { ! 209: if ((i = *s0 & 017) == 0) ! 210: return(0); ! 211: s = s0 + i - 1; ! 212: w = hyend - 1; ! 213: while (s > s0 && w >= wdstart && (*s & 0177) == maplow(cbits(*w))) { ! 214: s--; ! 215: w--; ! 216: } ! 217: if (s == s0) ! 218: break; ! 219: s0 += i; ! 220: } ! 221: s = s0 + i - 1; ! 222: w = hyend; ! 223: if (*s0 & HY_BIT) ! 224: goto mark; ! 225: while (s > s0) { ! 226: w--; ! 227: if (*s-- & HY_BIT) { ! 228: mark: ! 229: hyend = w - 1; ! 230: if (*s0 & 0100) /* 0100 used in suftab to encode something too */ ! 231: continue; ! 232: if (!chkvow(w)) ! 233: return(0); ! 234: *hyp++ = w; ! 235: } ! 236: } ! 237: if (*s0 & 040) ! 238: return(0); ! 239: if (exword()) ! 240: return(1); ! 241: goto again; ! 242: } ! 243: ! 244: ! 245: maplow(int i) ! 246: { ! 247: if (isupper(i)) ! 248: i = tolower(i); ! 249: return(i); ! 250: } ! 251: ! 252: ! 253: vowel(int i) ! 254: { ! 255: switch (i) { ! 256: case 'a': case 'A': ! 257: case 'e': case 'E': ! 258: case 'i': case 'I': ! 259: case 'o': case 'O': ! 260: case 'u': case 'U': ! 261: case 'y': case 'Y': ! 262: return(1); ! 263: default: ! 264: return(0); ! 265: } ! 266: } ! 267: ! 268: ! 269: Tchar *chkvow(Tchar *w) ! 270: { ! 271: while (--w >= wdstart) ! 272: if (vowel(cbits(*w))) ! 273: return(w); ! 274: return(0); ! 275: } ! 276: ! 277: ! 278: void digram(void) ! 279: { ! 280: Tchar *w; ! 281: int val; ! 282: Tchar *nhyend, *maxw; ! 283: int maxval; ! 284: extern char bxh[26][13], bxxh[26][13], xxh[26][13], xhx[26][13], hxx[26][13]; ! 285: ! 286: again: ! 287: if (!(w = chkvow(hyend + 1))) ! 288: return; ! 289: hyend = w; ! 290: if (!(w = chkvow(hyend))) ! 291: return; ! 292: nhyend = w; ! 293: maxval = 0; ! 294: w--; ! 295: while (++w < hyend && w < wdend - 1) { ! 296: val = 1; ! 297: if (w == wdstart) ! 298: val *= dilook('a', cbits(*w), bxh); ! 299: else if (w == wdstart + 1) ! 300: val *= dilook(cbits(*(w-1)), cbits(*w), bxxh); ! 301: else ! 302: val *= dilook(cbits(*(w-1)), cbits(*w), xxh); ! 303: val *= dilook(cbits(*w), cbits(*(w+1)), xhx); ! 304: val *= dilook(cbits(*(w+1)), cbits(*(w+2)), hxx); ! 305: if (val > maxval) { ! 306: maxval = val; ! 307: maxw = w + 1; ! 308: } ! 309: } ! 310: hyend = nhyend; ! 311: if (maxval > thresh) ! 312: *hyp++ = maxw; ! 313: goto again; ! 314: } ! 315: ! 316: ! 317: dilook(int a, int b, char t[26][13]) ! 318: { ! 319: int i, j; ! 320: ! 321: i = t[maplow(a)-'a'][(j = maplow(b)-'a')/2]; ! 322: if (!(j & 01)) ! 323: i >>= 4; ! 324: return(i & 017); ! 325: } ! 326: ! 327: ! 328: /* here beginneth the tex hyphenation code, as interpreted freely */ ! 329: /* the main difference is that there is no attempt to squeeze space */ ! 330: /* as tightly at tex does. */ ! 331: ! 332: static int texit(Tchar *, Tchar *); ! 333: static int readpats(void); ! 334: static void install(char *); ! 335: static void fixup(void); ! 336: static int trieindex(int, int); ! 337: ! 338: static char pats[50000]; /* size ought to be computed dynamically */ ! 339: static char *nextpat = pats; ! 340: static char *trie[27*27]; /* english-specific sizes */ ! 341: ! 342: int texhyphen(void) ! 343: { ! 344: static int loaded = 0; /* -1: couldn't find tex file */ ! 345: ! 346: if (hyphalg == 0 || loaded == -1) /* non-zero => tex for now */ ! 347: return 0; ! 348: if (loaded == 0) { ! 349: if (readpats()) ! 350: loaded = 1; ! 351: else ! 352: loaded = -1; ! 353: } ! 354: return texit(wdstart, wdend); ! 355: } ! 356: ! 357: static int texit(Tchar *start, Tchar *end) /* hyphenate as in tex, return # found */ ! 358: { ! 359: int nw, i, k, eq, cnt[500]; ! 360: char w[500+1], *np, *pp, *wp, *xpp, *xwp; ! 361: ! 362: w[0] = '.'; ! 363: for (nw = 1; start <= end && nw < 500-1; nw++, start++) ! 364: w[nw] = maplow(tolower(cbits(*start))); ! 365: start -= (nw - 1); ! 366: w[nw++] = '.'; ! 367: w[nw] = 0; ! 368: /* ! 369: * printf("try %s\n", w); ! 370: */ ! 371: for (i = 0; i <= nw; i++) ! 372: cnt[i] = '0'; ! 373: ! 374: for (wp = w; wp < w + nw; wp++) { ! 375: for (pp = trie[trieindex(*wp, *(wp+1))]; pp < nextpat; ) { ! 376: if (pp == 0 /* no trie entry */ ! 377: || *pp != *wp /* no match on 1st letter */ ! 378: || *(pp+1) != *(wp+1)) /* no match on 2nd letter */ ! 379: break; /* so move to next letter of word */ ! 380: eq = 1; ! 381: for (xpp = pp+2, xwp = wp+2; *xpp; ) ! 382: if (*xpp++ != *xwp++) { ! 383: eq = 0; ! 384: break; ! 385: } ! 386: if (eq) { ! 387: np = xpp+1; /* numpat */ ! 388: for (k = wp-w; *np; k++, np++) ! 389: if (*np > cnt[k]) ! 390: cnt[k] = *np; ! 391: /* ! 392: * printf("match: %s %s\n", pp, xpp+1); ! 393: */ ! 394: } ! 395: pp += *(pp-1); /* skip over pattern and numbers to next */ ! 396: } ! 397: } ! 398: /* ! 399: * for (i = 0; i < nw; i++) printf("%c", w[i]); ! 400: * printf(" "); ! 401: * for (i = 0; i <= nw; i++) printf("%c", cnt[i]); ! 402: * printf("\n"); ! 403: */ ! 404: /* ! 405: * for (i = 1; i < nw - 1; i++) { ! 406: * if (i > 2 && i < nw - 3 && cnt[i] % 2) ! 407: * printf("-"); ! 408: * if (cbits(start[i-1]) != '.') ! 409: * printf("%c", cbits(start[i-1])); ! 410: * } ! 411: * printf("\n"); ! 412: */ ! 413: for (i = 1; i < nw -1; i++) ! 414: if (i > 2 && i < nw - 3 && cnt[i] % 2) ! 415: *hyp++ = start + i - 1; ! 416: return hyp - hyptr; /* non-zero if a hyphen was found */ ! 417: } ! 418: ! 419: /* ! 420: This code assumes that hyphen.tex looks like ! 421: % some comments ! 422: \patterns{ % more comments ! 423: pat5ter4ns, 1 per line, SORTED, nothing else ! 424: } ! 425: more goo ! 426: \hyphenation{ % more comments ! 427: ex-cep-tions, one per line; i ignore this part for now ! 428: } ! 429: ! 430: this code is NOT robust against variations. unfortunately, ! 431: it looks like every local language version of this file has ! 432: a different format. i have also made no provision for weird ! 433: characters. sigh. ! 434: */ ! 435: ! 436: static int readpats(void) ! 437: { ! 438: FILE *fp; ! 439: char buf[200], buf1[200]; ! 440: ! 441: if ((fp = fopen(TEXHYPHENS, "r")) == NULL ! 442: && (fp = fopen(ALTHYPHENS, "r")) == NULL) { ! 443: ERROR "warning: can't find hyphen.tex" WARN; ! 444: return 0; ! 445: } ! 446: ! 447: while (fgets(buf, sizeof buf, fp) != NULL) { ! 448: sscanf(buf, "%s", buf1); ! 449: if (strcmp(buf1, "\\patterns{") == 0) ! 450: break; ! 451: } ! 452: while (fgets(buf, sizeof buf, fp) != NULL) { ! 453: if (buf[0] == '}') ! 454: break; ! 455: install(buf); ! 456: } ! 457: fclose(fp); ! 458: fixup(); ! 459: return 1; ! 460: } ! 461: ! 462: static void install(char *s) /* map ab4c5de to: 12 abcde \0 00405 \0 */ ! 463: { ! 464: int npat, lastpat; ! 465: char num[500], *onextpat = nextpat; ! 466: ! 467: num[0] = '0'; ! 468: *nextpat++ = ' '; /* fill in with count later */ ! 469: for (npat = lastpat = 0; *s != '\n' && *s != '\0'; s++) { ! 470: if (isdigit(*s)) { ! 471: num[npat] = *s; ! 472: lastpat = npat; ! 473: } else { ! 474: *nextpat++ = *s; ! 475: npat++; ! 476: num[npat] = '0'; ! 477: } ! 478: } ! 479: *nextpat++ = 0; ! 480: if (nextpat > pats + sizeof(pats)-20) { ! 481: ERROR "tex hyphenation table overflow, tail end ignored" WARN; ! 482: nextpat = onextpat; ! 483: } ! 484: num[lastpat+1] = 0; ! 485: strcat(nextpat, num); ! 486: nextpat += strlen(nextpat) + 1; ! 487: } ! 488: ! 489: static void fixup(void) /* build indexes of where . a b c ... start */ ! 490: { ! 491: char *p, *lastc; ! 492: int n; ! 493: ! 494: for (lastc = pats, p = pats+1; p < nextpat; p++) ! 495: if (*p == ' ') { ! 496: *lastc = p - lastc; ! 497: lastc = p; ! 498: } ! 499: *lastc = p - lastc; ! 500: for (p = pats+1; p < nextpat; ) { ! 501: n = trieindex(p[0], p[1]); ! 502: if (trie[n] == 0) ! 503: trie[n] = p; ! 504: p += p[-1]; ! 505: } ! 506: /* printf("pats = %d\n", nextpat - pats); */ ! 507: } ! 508: ! 509: static int trieindex(int d1, int d2) ! 510: { ! 511: return 27 * (d1 == '.' ? 0 : d1 - 'a' + 1) + (d2 == '.' ? 0 : d2 - 'a' + 1); ! 512: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.