|
|
1.1 root 1: #include "tdef.h"
2: #include "fns.h"
3: #include "ext.h"
4:
/*
 * HY_BIT is or'ed into a stored character to flag a possible
 * hyphenation point.  The code in this file only works for 7-bit
 * ascii.  The same value is used (as a literal) in suftab.c to
 * encode possible hyphenation points in suffixes; it could be
 * changed by widening the tables to be shorts instead of chars.
 */
#define	HY_BIT	0200
10:
11: /*
12: * troff8.c
13: *
14: * hyphenation
15: */
16:
17: char hbuf[NHEX];
18: char *nexth = hbuf;
19: Tchar *hyend;
20:
21: #define THRESH 160 /* digram goodness threshold */
22: int thresh = THRESH;
23:
24: int texhyphen(void);
25: static int alpha(Tchar);
26:
27: void hyphen(Tchar *wp)
28: {
29: int j;
30: Tchar *i;
31:
32: i = wp;
33: while (punct((*i++)))
34: ;
35: if (!alpha(*--i))
36: return;
37: wdstart = i++;
38: while (alpha(*i++))
39: ;
40: hyend = wdend = --i - 1;
41: while (punct((*i++)))
42: ;
43: if (*--i)
44: return;
45: if (wdend - wdstart < 4) /* 4 chars is too short to hyphenate */
46: return;
47: hyp = hyptr;
48: *hyp = 0;
49: hyoff = 2;
50:
51: /* for now, try exceptions first, then tex (if hyphalg is non-zero),
52: then suffix and digram if tex didn't hyphenate it at all.
53: */
54:
55: if (!exword() && !texhyphen() && !suffix())
56: digram();
57:
58: /* this appears to sort hyphenation points into increasing order */
59: *hyp++ = 0;
60: if (*hyptr)
61: for (j = 1; j; ) {
62: j = 0;
63: for (hyp = hyptr + 1; *hyp != 0; hyp++) {
64: if (*(hyp - 1) > *hyp) {
65: j++;
66: i = *hyp;
67: *hyp = *(hyp - 1);
68: *(hyp - 1) = i;
69: }
70: }
71: }
72: }
73:
74: static alpha(Tchar i) /* non-zero if really alphabetic */
75: {
76: if (ismot(i))
77: return 0;
78: else if (cbits(i) >= ALPHABET) /* this isn't very elegant, but there's */
79: return 0; /* no good way to make sure i is in range for */
80: else /* the call of isalpha */
81: return isalpha(cbits(i));
82: }
83:
84:
85: punct(Tchar i)
86: {
87: if (!i || alpha(i))
88: return(0);
89: else
90: return(1);
91: }
92:
93:
94: void caseha(void) /* set hyphenation algorithm */
95: {
96: hyphalg = HYPHALG;
97: if (skip())
98: return;
99: noscale++;
100: hyphalg = atoi0();
101: noscale = 0;
102: }
103:
104:
105: void caseht(void) /* set hyphenation threshold; not in manual! */
106: {
107: thresh = THRESH;
108: if (skip())
109: return;
110: noscale++;
111: thresh = atoi0();
112: noscale = 0;
113: }
114:
115:
116: void casehw(void)
117: {
118: int i, k;
119: char *j;
120: Tchar t;
121:
122: k = 0;
123: while (!skip()) {
124: if ((j = nexth) >= hbuf + NHEX - 2)
125: goto full;
126: for (; ; ) {
127: if (ismot(t = getch()))
128: continue;
129: i = cbits(t);
130: if (i == ' ' || i == '\n') {
131: *j++ = 0;
132: nexth = j;
133: *j = 0;
134: if (i == ' ')
135: break;
136: else
137: return;
138: }
139: if (i == '-') {
140: k = HY_BIT;
141: continue;
142: }
143: *j++ = maplow(i) | k;
144: k = 0;
145: if (j >= hbuf + NHEX - 2)
146: goto full;
147: }
148: }
149: return;
150: full:
151: ERROR "exception word list full." WARN;
152: *nexth = 0;
153: }
154:
155:
156: int exword(void)
157: {
158: Tchar *w;
159: char *e, *save;
160:
161: e = hbuf;
162: while (1) {
163: save = e;
164: if (*e == 0)
165: return(0);
166: w = wdstart;
167: while (*e && w <= hyend && (*e & 0177) == maplow(cbits(*w))) {
168: e++;
169: w++;
170: }
171: if (!*e) {
172: if (w-1 == hyend || (w == wdend && maplow(cbits(*w)) == 's')) {
173: w = wdstart;
174: for (e = save; *e; e++) {
175: if (*e & HY_BIT)
176: *hyp++ = w;
177: if (hyp > hyptr + NHYP - 1)
178: hyp = hyptr + NHYP - 1;
179: w++;
180: }
181: return(1);
182: } else {
183: e++;
184: continue;
185: }
186: } else
187: while (*e++)
188: ;
189: }
190: }
191:
192:
193: suffix(void)
194: {
195: Tchar *w;
196: char *s, *s0;
197: Tchar i;
198: extern char *suftab[];
199:
200: again:
201: i = cbits(*hyend);
202: if (!alpha(i))
203: return(0);
204: if (i < 'a')
205: i -= 'A' - 'a';
206: if ((s0 = suftab[i-'a']) == 0)
207: return(0);
208: for (;;) {
209: if ((i = *s0 & 017) == 0)
210: return(0);
211: s = s0 + i - 1;
212: w = hyend - 1;
213: while (s > s0 && w >= wdstart && (*s & 0177) == maplow(cbits(*w))) {
214: s--;
215: w--;
216: }
217: if (s == s0)
218: break;
219: s0 += i;
220: }
221: s = s0 + i - 1;
222: w = hyend;
223: if (*s0 & HY_BIT)
224: goto mark;
225: while (s > s0) {
226: w--;
227: if (*s-- & HY_BIT) {
228: mark:
229: hyend = w - 1;
230: if (*s0 & 0100) /* 0100 used in suftab to encode something too */
231: continue;
232: if (!chkvow(w))
233: return(0);
234: *hyp++ = w;
235: }
236: }
237: if (*s0 & 040)
238: return(0);
239: if (exword())
240: return(1);
241: goto again;
242: }
243:
244:
/*
 * maplow - return i converted to lower case if it is an upper-case
 * letter, otherwise i unchanged.
 *
 * The <ctype.h> functions are only defined for EOF and values that
 * fit in an unsigned char, so anything outside 0..0377 is passed
 * through without being classified.  The return type is spelled out
 * because implicit int is not valid C99.
 */
int maplow(int i)
{
	if (i >= 0 && i <= 0377 && isupper(i))
		return tolower(i);
	return i;
}
251:
252:
/*
 * vowel - 1 if i is one of a e i o u y in either case, else 0.
 *
 * The return type is spelled out: implicit int is not valid C99.
 */
int vowel(int i)
{
	switch (i) {
	case 'a': case 'A':
	case 'e': case 'E':
	case 'i': case 'I':
	case 'o': case 'O':
	case 'u': case 'U':
	case 'y': case 'Y':
		return 1;
	default:
		return 0;
	}
}
267:
268:
269: Tchar *chkvow(Tchar *w)
270: {
271: while (--w >= wdstart)
272: if (vowel(cbits(*w)))
273: return(w);
274: return(0);
275: }
276:
277:
278: void digram(void)
279: {
280: Tchar *w;
281: int val;
282: Tchar *nhyend, *maxw;
283: int maxval;
284: extern char bxh[26][13], bxxh[26][13], xxh[26][13], xhx[26][13], hxx[26][13];
285:
286: again:
287: if (!(w = chkvow(hyend + 1)))
288: return;
289: hyend = w;
290: if (!(w = chkvow(hyend)))
291: return;
292: nhyend = w;
293: maxval = 0;
294: w--;
295: while (++w < hyend && w < wdend - 1) {
296: val = 1;
297: if (w == wdstart)
298: val *= dilook('a', cbits(*w), bxh);
299: else if (w == wdstart + 1)
300: val *= dilook(cbits(*(w-1)), cbits(*w), bxxh);
301: else
302: val *= dilook(cbits(*(w-1)), cbits(*w), xxh);
303: val *= dilook(cbits(*w), cbits(*(w+1)), xhx);
304: val *= dilook(cbits(*(w+1)), cbits(*(w+2)), hxx);
305: if (val > maxval) {
306: maxval = val;
307: maxw = w + 1;
308: }
309: }
310: hyend = nhyend;
311: if (maxval > thresh)
312: *hyp++ = maxw;
313: goto again;
314: }
315:
316:
/*
 * dilook - fetch the 4-bit entry for the letter pair (a, b) from
 * digram table t.
 *
 * Both letters are lower-cased with maplow().  The row is selected
 * by a; each byte of the row holds two entries, the one for an even
 * b offset in the high nibble and the one for an odd b offset in the
 * low nibble.  Returns a value in 0..15.
 *
 * Changes from the earlier version: the return type is explicit
 * (implicit int is not valid C99), and a pair containing anything
 * other than a letter a-z yields 0 instead of indexing outside t.
 */
int dilook(int a, int b, char t[26][13])
{
	int row, col, cell;

	row = maplow(a) - 'a';
	col = maplow(b) - 'a';
	if (row < 0 || row >= 26 || col < 0 || col >= 26)
		return(0);
	cell = t[row][col / 2];
	if (!(col & 01))
		cell >>= 4;
	return(cell & 017);
}
326:
327:
/* here beginneth the tex hyphenation code, as interpreted freely. */
/* the main difference is that there is no attempt to squeeze space */
/* as tightly as tex does. */
331:
332: static int texit(Tchar *, Tchar *);
333: static int readpats(void);
334: static void install(char *);
335: static void fixup(void);
336: static int trieindex(int, int);
337:
338: static char pats[50000]; /* size ought to be computed dynamically */
339: static char *nextpat = pats;
340: static char *trie[27*27]; /* english-specific sizes */
341:
342: int texhyphen(void)
343: {
344: static int loaded = 0; /* -1: couldn't find tex file */
345:
346: if (hyphalg == 0 || loaded == -1) /* non-zero => tex for now */
347: return 0;
348: if (loaded == 0) {
349: if (readpats())
350: loaded = 1;
351: else
352: loaded = -1;
353: }
354: return texit(wdstart, wdend);
355: }
356:
357: static int texit(Tchar *start, Tchar *end) /* hyphenate as in tex, return # found */
358: {
359: int nw, i, k, eq, cnt[500];
360: char w[500+1], *np, *pp, *wp, *xpp, *xwp;
361:
362: w[0] = '.';
363: for (nw = 1; start <= end && nw < 500-1; nw++, start++)
364: w[nw] = maplow(tolower(cbits(*start)));
365: start -= (nw - 1);
366: w[nw++] = '.';
367: w[nw] = 0;
368: /*
369: * printf("try %s\n", w);
370: */
371: for (i = 0; i <= nw; i++)
372: cnt[i] = '0';
373:
374: for (wp = w; wp < w + nw; wp++) {
375: for (pp = trie[trieindex(*wp, *(wp+1))]; pp < nextpat; ) {
376: if (pp == 0 /* no trie entry */
377: || *pp != *wp /* no match on 1st letter */
378: || *(pp+1) != *(wp+1)) /* no match on 2nd letter */
379: break; /* so move to next letter of word */
380: eq = 1;
381: for (xpp = pp+2, xwp = wp+2; *xpp; )
382: if (*xpp++ != *xwp++) {
383: eq = 0;
384: break;
385: }
386: if (eq) {
387: np = xpp+1; /* numpat */
388: for (k = wp-w; *np; k++, np++)
389: if (*np > cnt[k])
390: cnt[k] = *np;
391: /*
392: * printf("match: %s %s\n", pp, xpp+1);
393: */
394: }
395: pp += *(pp-1); /* skip over pattern and numbers to next */
396: }
397: }
398: /*
399: * for (i = 0; i < nw; i++) printf("%c", w[i]);
400: * printf(" ");
401: * for (i = 0; i <= nw; i++) printf("%c", cnt[i]);
402: * printf("\n");
403: */
404: /*
405: * for (i = 1; i < nw - 1; i++) {
406: * if (i > 2 && i < nw - 3 && cnt[i] % 2)
407: * printf("-");
408: * if (cbits(start[i-1]) != '.')
409: * printf("%c", cbits(start[i-1]));
410: * }
411: * printf("\n");
412: */
413: for (i = 1; i < nw -1; i++)
414: if (i > 2 && i < nw - 3 && cnt[i] % 2)
415: *hyp++ = start + i - 1;
416: return hyp - hyptr; /* non-zero if a hyphen was found */
417: }
418:
419: /*
420: This code assumes that hyphen.tex looks like
421: % some comments
422: \patterns{ % more comments
423: pat5ter4ns, 1 per line, SORTED, nothing else
424: }
425: more goo
426: \hyphenation{ % more comments
427: ex-cep-tions, one per line; i ignore this part for now
428: }
429:
430: this code is NOT robust against variations. unfortunately,
431: it looks like every local language version of this file has
432: a different format. i have also made no provision for weird
433: characters. sigh.
434: */
435:
436: static int readpats(void)
437: {
438: FILE *fp;
439: char buf[200], buf1[200];
440:
441: if ((fp = fopen(TEXHYPHENS, "r")) == NULL
442: && (fp = fopen(ALTHYPHENS, "r")) == NULL) {
443: ERROR "warning: can't find hyphen.tex" WARN;
444: return 0;
445: }
446:
447: while (fgets(buf, sizeof buf, fp) != NULL) {
448: sscanf(buf, "%s", buf1);
449: if (strcmp(buf1, "\\patterns{") == 0)
450: break;
451: }
452: while (fgets(buf, sizeof buf, fp) != NULL) {
453: if (buf[0] == '}')
454: break;
455: install(buf);
456: }
457: fclose(fp);
458: fixup();
459: return 1;
460: }
461:
462: static void install(char *s) /* map ab4c5de to: 12 abcde \0 00405 \0 */
463: {
464: int npat, lastpat;
465: char num[500], *onextpat = nextpat;
466:
467: num[0] = '0';
468: *nextpat++ = ' '; /* fill in with count later */
469: for (npat = lastpat = 0; *s != '\n' && *s != '\0'; s++) {
470: if (isdigit(*s)) {
471: num[npat] = *s;
472: lastpat = npat;
473: } else {
474: *nextpat++ = *s;
475: npat++;
476: num[npat] = '0';
477: }
478: }
479: *nextpat++ = 0;
480: if (nextpat > pats + sizeof(pats)-20) {
481: ERROR "tex hyphenation table overflow, tail end ignored" WARN;
482: nextpat = onextpat;
483: }
484: num[lastpat+1] = 0;
485: strcat(nextpat, num);
486: nextpat += strlen(nextpat) + 1;
487: }
488:
489: static void fixup(void) /* build indexes of where . a b c ... start */
490: {
491: char *p, *lastc;
492: int n;
493:
494: for (lastc = pats, p = pats+1; p < nextpat; p++)
495: if (*p == ' ') {
496: *lastc = p - lastc;
497: lastc = p;
498: }
499: *lastc = p - lastc;
500: for (p = pats+1; p < nextpat; ) {
501: n = trieindex(p[0], p[1]);
502: if (trie[n] == 0)
503: trie[n] = p;
504: p += p[-1];
505: }
506: /* printf("pats = %d\n", nextpat - pats); */
507: }
508:
/*
 * trieindex - slot number for a two-character key: 27 * row + column,
 * where '.' maps to 0 and a letter c maps to c - 'a' + 1.
 */
static int trieindex(int d1, int d2)
{
	int row, col;

	if (d1 == '.')
		row = 0;
	else
		row = d1 - 'a' + 1;
	if (d2 == '.')
		col = 0;
	else
		col = d2 - 'a' + 1;
	return 27 * row + col;
}
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.