|
|
1.1 root 1: /* Copyright (c) 1989, 1990 AT&T --- All Rights Reserved. */
2: /* THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF AT&T. */
3: /* The copyright notice does not imply actual or intended publication. */
4: /* AUTHORS: */
5: /* H. S. Baird - ATT-BL MH - first versions */
6:
7: /* Text.h - typedefs, constants, and function declarations for document-images
8: (see Text.c for companion functions.)
9: INCLUDES
10: requires prior:
11: #include "stdocr.h"
12: */
13:
/* Trace switches: when non-zero, each record written (fwrb) or read (frdb)
   is also reported via err("%s",R_toa()). */
#define dbg_fwrb_toa	(F)
#define dbg_frdb_toa	(F)

/* Version number of the Dim-file format currently in use. */
#define DIM_VERSION	(0)
18:
19: #include "Bfeats.h"
20:
/* Ident: integer type that carries identification bits.
   It MUST be at least 32 bits wide. */
#define Ident	int

/* Reading/writing an Ident is delegated to the uint4 stream helpers. */
#define fwri_Ident(F,V)	fwri_uint4((F),(V))
#define frdi_Ident(F)	frdi_uint4(F)
25:
/* Ident bits */
/* Record-type bits: mark which kind(s) of external file record is meant. */
#define IsPage		04000000000
#define IsBlock		02000000000
#define IsTxtln		01000000000
#define IsWord		00020000000
#define IsChar		00400000000
#define IsBlob		00200000000
#define IsRun		00100000000	/* should really be IsRuns or IsLag */
#define IsRuns		(IsRun)
#define IsInterp	00040000000	/* interpretation of a Char */
#define IsBdy		00010000000
#define IsShapes	00004000000
#define IsLag		00002000000
#define IsBfeats	00001000000
#define IsSfeats	00000400000
#define IsWordInterp	00000200000

/* Runs bits, carried in the ids of Blob records.
   In main memory at most one of them may be set.
   In peripheral file format only Runs_ff or Runs_g4 may be set. */
#define Runs_f		00000010000	/* Runs form a list starting at *.r.f */
#define Runs_ff		00000020000	/* RunFs form an array at *.r.ff */
#define Runs_seek	00000040000	/* Runs remain in the file, at *.seek */
#define Runs_g4		00000100000	/* Runs are CCITT Group 4 encoded */

/* CCITT Group 4 data in a peripheral file is laid out as (a) an unsigned
   long byte count, followed by (b) that many bytes of Group 4 encoding
   exactly as specified by CCITT Recommendation T.6 -- with one exception:
   no EOFB code is appended, because the bounding box of the owning Blob
   already tells where the bitmap ends.  In its place, the final scan line
   is simply padded with '0' bits up to the next full byte.  On average
   this is about 8 times more compact than RunF/RunFS encoding. */

/* Every record-type bit at once, and none of them. */
#define IsALL (IsPage|IsBlock|IsTxtln|IsWord|IsChar|IsBlob|IsRun|IsBdy|IsInterp|IsShapes|IsLag|IsBfeats|IsSfeats|IsWordInterp)
#define IsNONE 0
61:
62:
63: /* Enable optional debugging-support code to maintain a count of selected record
64: types that are allocated using alloc_* free_* and dup_* functions. The counts
65: are in units of records, not bytes. By design, has no effect at all on
66: correctness. */
67: #define ALLOC_CENSUS (0)
68:
#if ALLOC_CENSUS

/* Census - one tally per censused record type.  Units are records. */
typedef struct Census {
	int Page_mny;
	int Block_mny;
	int Txtln_mny;
	int Word_mny;
	int Char_mny;
	int Blob_mny;
	int Run_mny;
	int Interp_mny;
	int Bfeats_mny;
	int BMask_mny;
} Census;

#define Init_Census {0,0,0,0,0,0,0,0,0,0}
#if MAIN
Census empty_Census = Init_Census;
Census _CENSUS = Init_Census;
#else
extern Census empty_Census;
extern Census _CENSUS;
#endif

/* alloc_census(id,n) / free_census(id,n): add n to / subtract n from the
   `<id>_mny' tally of the global _CENSUS.  `id' is a bare record-type name
   (Page, Block, ..., BMask).
   The field name is built with the ISO C `##' operator.  The former
   empty-comment form (`./ * * /id/ * * /_mny') only glues tokens together
   under pre-ANSI preprocessors; an ISO preprocessor turns each comment
   into a space, producing the invalid text `_CENSUS. id _mny'. */
#define alloc_census(id,n) (_CENSUS.id##_mny += (n))
#define free_census(id,n) (_CENSUS.id##_mny -= (n))

/* err_census(S,C): report every tally of Census *C via err(), prefixed
   by the string S. */
#define err_census(S,C) err("%s P%d B%d l%d w%d c%d b%d r%d i%d bf%d bm%d",\
	S,(C)->Page_mny,(C)->Block_mny,(C)->Txtln_mny,(C)->Word_mny,\
	(C)->Char_mny,(C)->Blob_mny,(C)->Run_mny,(C)->Interp_mny,\
	(C)->Bfeats_mny,(C)->BMask_mny )

/* Report the global tallies. */
#define err_census_all err_census("allocated: ",&(_CENSUS))

/* Report the tallies that census_rec() returns for record rp, labelled
   with the printable form of rp->ident. */
#define err_census_rec(rp) { \
	Census *cs; cs = (Census *)census_rec((rp)); \
	err_census(ident_toa((rp)->ident),cs); \
	}
#else

/* Census disabled: every census macro expands to nothing. */
#define alloc_census(i,n)
#define free_census(i,n)
#define err_census(s,c)
#define err_census_all
#define err_census_rec(r)

#endif
113:
114:
115: /* Most record types can own an ASCII label, which is simply a
116: '\0'-terminated string. Its uses are varied. */
117: #define MAX_LABEL_LEN 128 /* maximum no. characters in a label string */
118:
/* fwrb_label(F,L): write label string L to stream F.
   With FWRI set the string is handed to fwri_str(); otherwise its
   characters are written with fputs() followed by an explicit '\0' byte.
   With dbg_fwrb_toa set, the label is additionally echoed through err().
   The two choices are independent, so each is selected by its own helper
   macro and fwrb_label() simply joins them inside one brace block. */
#if FWRI
#define fwrb_label_put_(F,L)	fwri_str((F),(L));
#else
#define fwrb_label_put_(F,L)	fputs((L),(F)); fputc('\0',(F));
#endif

#if dbg_fwrb_toa
#define fwrb_label_trace_(L)	err("fwrb_label: \"%s\"",(L));
#else
#define fwrb_label_trace_(L)
#endif

#define fwrb_label(F,L) { \
	fwrb_label_put_(F,L) \
	fwrb_label_trace_(L) \
	}
148:
149: char *frdb_label();
150:
151: /* A boundary is an ordered list of vertices.
152: In some uses, it is assumed to close: in this case, the first point
153: is not repeated at the end.
154: `vn' counts the no. vertices. bdy_trace omits consecutive duplicates and
155: compresses horizontal and vertical runs.
156: `per' counts the no. of pixels on the 8-connected boundary. An attempt has been
157: made not to count consecutive duplicate pixels, but this may be buggy.
158: `ren' counts the number of run-ends touched. Each run contributes two ends,
159: even if it is one pixel long. The sum of ren-counts among all bdys for a blob
160: should equal exactly twice the no. runs.
161: Note that vn<=per and vn<=2*ren, and (probably) ren<=per.
162: A ``smoothed'' version (courtesy of John Hobby) may be given in s[],
163: expressed in fractional pixels.
164: */
165: typedef struct Bdy {
166: Ident ident; /* shows type of boundary */
167: Bbx bx; /* bounding box (usually relative to blob's bx.a) */
168: long per; /* no. pixels in 8-connected perimeter */
169: int ren; /* no. run-ends touched */
170: int vn; /* no. distinct vertices in v[] */
171: Sp *v; /* array of vn+1 vertices, v[0]==v[vn] (malloc space) */
172: short fr; /* fraction of pixels used in smoothed outline */
173: int sn; /* no. distinct smoothed vertices in s[] */
174: Sp *s; /* array of sn+1 vertices, s[0]==s[sn] (malloc space) */
175: int an; /* no. vertices in polygonal approximation */
176: Sp **ap; /* approx'n: array of an ptrs into v[] (in malloc space) */
177: float err; /* error tolerance used for approximation */
178: int hn; /* no. vertices in convex hull of polygonal approx'n */
179: Sp ***hpp; /* convex hull: array of hn ptrs into ap[] (malloc) */
180: struct Bdy *n; /* for use when a member of a linked-list */
181: } Bdy;
182:
183: #define Init_Bdy {IsBdy,Init_Bbx,0,0,0,NULL,1,0,NULL,0,NULL,0.0,0,NULL,NULL}
184: #if MAIN
185: Bdy empty_Bdy = Init_Bdy;
186: #else
187: extern Bdy empty_Bdy;
188: #endif
189:
/* Bdy.ident flags */
#define Bdy_verts	00000000001	/* vertices */
#define Bdy_approx	00000000002	/* polygonal approximation */
#define Bdy_hull	00000000004	/* convex hull */
#define Bdy_ALL		(Bdy_verts|Bdy_approx|Bdy_hull)
#define Bdy_ccw		00000000100	/* wound counter-clockwise */
#define Bdy_half	00000000200	/* boundary points are half-pixel */

197: /* A boundaries-set is an ordered list of boundaries.
198: In some uses, they are used to enclose a connected region: in this case,
199: all are closed, and the first is conventionally the exterior and the others
200: are interior boundaries. The interior is then always to the left
201: of the boundary: that is, the exterior boundary is oriented counter-
202: clockwise, and the interior boundaries clockwise */
203: typedef struct Bdys {
204: int mny; /* no. boundaries */
205: long per; /* perimeter of all bdys */
206: Bdy *b; /* array of boundaries (malloc space) (sometimes first) */
207: } Bdys;
208:
209: #define Init_Bdys {0,0,NULL}
210: #if MAIN
211: Bdys empty_Bdys = Init_Bdys;
212: #else
213: extern Bdys empty_Bdys;
214: #endif
215:
216: /* A boundary edge is an ordered pair of vertices along a boundary. It implicitly
217: describes an 8-connected sequence of pixels from a to b, inclusive. Also
218: used for straight-line approximations to the set of pixels, and the convex hull
219: of such an approximation. */
220: typedef struct BdyEdge {
221: Bdy *byp; /* boundary to which it belongs */
222: Sp *ap,*bp; /* ptrs into byp->v */
223: long per; /* perimeter: no. 8-connected pixels */
224: Pp ctr; /* centroid */
225: Radians ang; /* ls-fitted angle (directed roughly from a to b) */
226: Pp a; /* endpoints a & b (with sub-pixel precision) */
227: Pp b;
228: } BdyEdge;
229:
230: #define Init_BdyEdge {NULL,NULL,NULL,0,Init_Zero_Pp,0.0,Init_Zero_Pp,Init_Zero_Pp}
231: #if MAIN
232: BdyEdge empty_BdyEdge = Init_BdyEdge;
233: #else
234: extern BdyEdge empty_BdyEdge;
235: #endif
236:
237: /* Ordered set of BdyEdges */
238: typedef struct BdyEdges {
239: int mny;
240: BdyEdge **pa; /* NULL-terminated array of pointers to BdyEdges */
241: } BdyEdges;
242:
243: #define Init_BdyEdges {0,NULL}
244: #if MAIN
245: BdyEdges empty_BdyEdges = Init_BdyEdges;
246: #else
247: extern BdyEdges empty_BdyEdges;
248: #endif
249:
250: /* Moments of a region (or boundary-list) of pixels */
251:
252: typedef struct Moments {
253: /* 0th moment */
254: int M00; /* area: sum of 1 */
255: /* 1st moments */
256: int M10; /* sum of xi */
257: int M01; /* sum of yi */
258: Pp c; /* centroid: M10/M00, M01/M00 */
259: /* 2nd moments (relative to centroid) */
260: float M20; /* sum of rxi*rxi */
261: float M11; /* sum of rxi*ryi */
262: float M02; /* sum of ryi*ryi */
263: Radians a; /* orientation angle, in [-PI/2,PI/2) (radians) */
264: Pp d; /* directional vector of principal axis */
265: } Moments;
266:
267: #define Init_Moments {0,0,0,{0.0,0.0},0.0,0.0,0.0,0.0,{0.0,0.0}}
268: #if MAIN
269: Moments zero_Moments = Init_Moments;
270: #else
271: extern Moments zero_Moments;
272: #endif
273:
274: /* functions in Text.c */
275: Bdy *alloc_bdy();
276: Bdys *alloc_bdys();
277:
278: /* functions in Bdy.c */
279: Bdys *dup_bdys_etc();
280: Bdy *dup_bdy_etc();
281: Bdys *boundaries();
282: char *moments_toa();
283: Moments *bdy_moments();
284: boolean fit_bdyedge();
285: BdyEdge *dup_bdyedge();
286: BdyEdge *append_bdyedge();
287: remove_bdyedge();
288: free_bdyedges();
289: BdyEdges *dup_bdyedges();
290:
291: /* Each Run is initially inserted into a `line set', owned by its scan Line.
292: Later, as connections are discovered, it joins a `tree set'.
293: When the forest of trees of which it is a part is finally complete, the Run
294: is removed from its `line set', and added to a `blob set'.
295: */
296:
297: typedef struct Run { /* internal (main memory) record */
298: Scoor y, xs, xe; /* coordinates of black interval (y,[xs,xe]) */
299: struct Run *n; /* line & blob sets: next Run */
300: unsigned short ad, bd; /* tree set: above,below degrees (no. conn'd)*/
301: struct Run *ac, *bc; /* tree set: above,below leftmost connections */
302: union { struct Tree *o; /* tree set: owner Tree */
303: int no; /* blob set: sequence no. 0,1,... in set */
304: } u; /* (overlain fields) */
305: } Run;
306:
307: #define Init_Run {0,0,0,NULL,0,0,NULL,NULL}
308: #if MAIN
309: Run empty_Run = Init_Run;
310: #else
311: extern Run empty_Run;
312: #endif
313:
314: /* Peripheral file format. `ac' and `bc' are relative to the position
315: of this run in the canonical run order. */
316: typedef struct RunF { /* external (peripheral file) record (full size) */
317: Scoor y, xs, xe; /* coordinates of black interval (y,[xs,xe]) */
318: unsigned short ad, bd; /* above,below degrees */
319: unsigned short ac, bc; /* above,below leftmost connections (indices) */
320: } RunF;
321:
322: #define Init_RunF {0,0,0,0,0,0,0}
323: #if MAIN
324: RunF empty_RunF = Init_RunF;
325: #else
326: extern RunF empty_RunF;
327: #endif
328:
/* fwri_RunF(F,P): write RunF *P to stream F, one field at a time in the
   order y, xs, xe, ad, bd, ac, bc.  Coordinates go through fwri_Scoor(),
   the remaining fields through fwri_uint2(). */
#define fwri_RunF(F,P) { \
	fwri_Scoor((F),(P)->y);  fwri_Scoor((F),(P)->xs); \
	fwri_Scoor((F),(P)->xe); \
	fwri_uint2((F),(P)->ad); fwri_uint2((F),(P)->bd); \
	fwri_uint2((F),(P)->ac); fwri_uint2((F),(P)->bc); \
	}
338:
/* frdi_RunF(F,P): read one RunF from stream F into *P.
   Value of the expression:
	0	F already had its end-of-file indicator set; nothing is read
	1	all seven fields were read and ferror(F) is clear
	-errno	ferror(F) is set after the reads
   Fields are read in the order y, xs, xe, ad, bd, ac, bc.
   NOTE(review): feof() is consulted only before reading, so end-of-file
   reached in the middle of this record is not reported as 0 -- confirm
   that callers bound their reads by a record count. */
#define frdi_RunF(F,P) ( !feof(F) ? ( \
	(P)->y  = frdi_Scoor(F), \
	(P)->xs = frdi_Scoor(F), \
	(P)->xe = frdi_Scoor(F), \
	(P)->ad = frdi_uint2(F), \
	(P)->bd = frdi_uint2(F), \
	(P)->ac = frdi_uint2(F), \
	(P)->bc = frdi_uint2(F), \
	(!ferror(F) ? 1 : -errno) ) : 0 )
348:
349: /* In `well-behaved' text, the overwhelming majority of Blobs are small
350: enough that all their Runs can be encoded using character data fields,
351: a factor of two saving, which is important since a dense IEEE proceedings
352: page blob file would otherwise require 2.3Mbytes */
/* RunFS - external (peripheral file) record, small size: every field is a
   single unsigned char. */
typedef struct RunFS {
	unsigned char y;	/* scan line of the interval (y,[xs,xe]) */
	unsigned char xs;	/* first x of the interval */
	unsigned char xe;	/* last x of the interval */
	unsigned char ad;	/* degree above */
	unsigned char bd;	/* degree below */
	unsigned char ac;	/* leftmost connection above (an index) */
	unsigned char bc;	/* leftmost connection below (an index) */
} RunFS;

#define Init_RunFS {0,0,0,0,0,0,0}

/* Defined in the one unit compiled with MAIN set; extern elsewhere. */
#if !MAIN
extern RunFS empty_RunFS;
#else
RunFS empty_RunFS = Init_RunFS;
#endif
365:
/* fwri_RunFS(F,P): write RunFS *P to stream F, one field at a time in the
   order y, xs, xe, ad, bd, ac, bc, each through fwri_uint1(). */
#define fwri_RunFS(F,P) { \
	fwri_uint1((F),(P)->y);  fwri_uint1((F),(P)->xs); \
	fwri_uint1((F),(P)->xe); \
	fwri_uint1((F),(P)->ad); fwri_uint1((F),(P)->bd); \
	fwri_uint1((F),(P)->ac); fwri_uint1((F),(P)->bc); \
	}
375:
/* frdi_RunFS(F,P): read one RunFS from stream F into *P.
   Value of the expression:
	0	F already had its end-of-file indicator set; nothing is read
	1	all seven fields were read and ferror(F) is clear
	-errno	ferror(F) is set after the reads
   Fields are read in the order y, xs, xe, ad, bd, ac, bc.
   NOTE(review): feof() is consulted only before reading, so end-of-file
   reached in the middle of this record is not reported as 0 -- confirm
   that callers bound their reads by a record count. */
#define frdi_RunFS(F,P) ( !feof(F) ? ( \
	(P)->y  = frdi_uint1(F), \
	(P)->xs = frdi_uint1(F), \
	(P)->xe = frdi_uint1(F), \
	(P)->ad = frdi_uint1(F), \
	(P)->bd = frdi_uint1(F), \
	(P)->ac = frdi_uint1(F), \
	(P)->bc = frdi_uint1(F), \
	(!ferror(F) ? 1 : -errno) ) : 0 )
385:
386: /* Set of runs. PROPOSED NEW FORMAT. Not yet incorporated widely. */
387: typedef struct Runs {
388: Ident ident; /* IsRuns & Runs_fi, Runs_ff, Runs_fs, or Runs_sk flags */
389: int mny; /* no. runs */
390: union { /* access to runs */
391: struct Run *fi; /* first run of singly-linked list */
392: struct RunF *ff; /* top of RunF[mny] array */
393: struct RunFS *fs; /* top of RunFS[mny] array */
394: long sk; /* seek(F,seek,0) will find them in file F */
395: } r;
396: } Runs;
397:
398: #define Init_Runs {IsRuns,0} /* NOTE: can't initialize union */
399: #if MAIN
400: Runs empty_Runs = Init_Runs;
401: #else
402: extern Runs empty_Runs;
403: #endif
404:
405: /* INTERNAL management */
406:
407: #if !MAIN
408: extern
409: #endif
410: struct {
411: int incr; /* size of each pool[i] */
412: int pools; /* no. of pools allocated */
413: Run **pool; /* malloc space Run pool[pools][0..incr-1] */
414: int next; /* the next avail Run is: pool[pools-1][next] */
415: Run *free; /* head of free lifo list (NULL if none) */
416: Run *cur; /* most-recently allocated Run */
417: int total; /* total no. ever allocated */
418: boolean dbg;
419: } _RunPool
420: #if MAIN
421: = {0,0,NULL,0,NULL,NULL,0,F}
422: #endif
423: ;
424:
425: /* Run management routines (Text.c) */
426: boolean alloc_run_pool();
427: free_run_pool();
428: Run *hard_alloc_run();
429: err_run();
430: err_runb();
431: err_runf();
432: err_runfs();
433: err_run_stats();
434:
/* alloc_run(): obtain a Run from _RunPool; the expression yields a (Run *).
   _RunPool.total is bumped first.  Then, in order of preference:
     - pop the head of the free list;
     - else take slot `next' of the newest pool, if one is left;
     - else call hard_alloc_run().
   In the two inline cases the Run is overwritten with empty_Run and also
   remembered in _RunPool.cur. */
#define alloc_run() ( ++_RunPool.total, \
	(_RunPool.free==NULL)? \
		( (_RunPool.next>=_RunPool.incr)? \
			hard_alloc_run(): \
			( _RunPool.cur = &_RunPool.pool[_RunPool.pools-1][_RunPool.next++], \
			  *(_RunPool.cur) = empty_Run, \
			  _RunPool.cur ) ): \
		( _RunPool.cur = _RunPool.free, \
		  _RunPool.free = _RunPool.cur->n, \
		  *(_RunPool.cur) = empty_Run, \
		  _RunPool.cur ) )

/* free_run(rp): push Run *rp onto the head of the _RunPool free list.
   Done entirely inline; rp is evaluated twice. */
#define free_run(rp) { \
	(rp)->n = _RunPool.free; \
	_RunPool.free = (rp); \
	}
446:
447: /* EXTERNAL file format:
448: If BlobF.runs is zero, then conventionally the Runs have simply been omitted.
449: The RunF.y, RunF.xs, & RunF.xe coordinates are offsets from BlobF.bx.a
450: (their blob's left-top corner). RunF.ac & RunF.bc index into an array of
451: only those RunF records belonging to the current BlobF, in ascending
452: lexicographic order on (RunF.y,RunF.xs) -- so that they are in the range
453: [0,BlobF.runs-1].
454: IMPROVEMENTS:
455: */
456:
457: /* some subroutines are too lazy to handle indefinitely large blobs */
458: #define Runs_Max 10000
459:
460: /* A Blob is (formally) a maximal 8-connected set of black pixels.
461: The connectivity algorithm finds them in strictly increasing order on
462: (y,xe) of its Run with highest (y,xe).
463: */
464:
465: typedef struct Blob { /* internal (main memory) record */
466: Ident ident; /* identification bits */
467: Seq no; /* blob sequence no */
468: Bbx bx;
469: long area;
470: long per;
471: struct Blob *n; /* free set: next blob */
472: Merit m; /* Only used locally (not for peripheral file) */
473: Runs *rsp; /* runs (not yet used) */
474: Bdys *bdsp; /* boundaries (in malloc space); NULL if none */
475: /* presently in use (but planned to be replaced by Runs) */
476: int runs;
477: union { /* to find runs */
478: struct Run *f; /* blob set: first run */
479: struct RunF *ff; /* top of RunF array */
480: long seek; /* seek(f,seek,0) will find them */
481: } r;
482: } Blob;
483:
484: #define Init_Blob {IsBlob,0,Init_Bbx,0,0,NULL,0.0,NULL,NULL,0,}
485: #if MAIN
486: Blob empty_Blob = Init_Blob;
487: #else
488: extern Blob empty_Blob;
489: #endif
490:
491: typedef struct Blobs { /* Blob set */
492: int mny; /* the number of pointers in set */
493: Blob **bpa; /* pts to NULL-terminated array[mny+1] of pointers */
494: } Blobs;
495:
496: #define Init_Blobs {0,NULL}
497: #if MAIN
498: Blobs empty_Blobs = Init_Blobs;
499: #else
500: extern Blobs empty_Blobs;
501: #endif
502:
503: /* Singly-linked list of Blobs. Only forward `next' links Blob.n are used. */
504: typedef struct Blobl { /* Blob list */
505: int mny; /* the number in set */
506: Blob *fi; /* to first */
507: Blob *la; /* to last */
508: } Blobl;
509:
510: #define Init_Blobl {0,NULL,NULL}
511: #if MAIN
512: Blobl empty_Blobl = Init_Blobl;
513: #else
514: extern Blobl empty_Blobl;
515: #endif
516:
517: typedef struct BlobF { /* external file format */
518: Ident ident; /* identification bits: IsBlob must be set */
519: Bbx bx;
520: long area;
521: long per;
522: int runs; /* no. runs to follow */
523: short bdys; /* no. bdys to follow */
524: } BlobF;
525:
/* Blob identification bits */
#define Blob_lm		00000000001	/* touches the left margin */
#define Blob_rm		00000000002	/* touches the right margin */
#define Blob_tm		00000000004	/* touches the top margin */
#define Blob_bm		00000000010	/* touches the bottom margin */
#define Blob_chopt	00000000020	/* chopped at the top */
#define Blob_chopb	00000000040	/* chopped at the bottom */
#define Blob_chopl	00000002000	/* chopped at the left */
#define Blob_chopr	00000004000	/* chopped at the right */
#define Blob_small	00000000200	/* runs can be compressed x2 */
#define Blob_local	00000000400	/* unassigned: free for local pgm use */
537:
538: /* INTERNAL management: */
539:
540: int hi_blob_no; /* current highest blob no */
541:
542: /* Blobs are allocated from a pool of free ones */
543: int blob_max;
544: Blob *blob_pool;
545: Blob blob_fr; /* head of list of free blobs */
546: int blob_fr_mny;
547: int blob_hi; /* high-water mark in blob pool */
548: int blob_chopped; /* total no. of blobs that were chopped */
549: boolean blob_debug; /* debug traces? */
550:
551: /* EXTERNAL file format:
552: A Blob file consists of an arbitrary number of:
553: BlobF record, followed by BlobF.runs instances of:
554: RunF record
555: If BlobF.runs is zero, then conventionally the Runs have simply been omitted.
556: The RunF.y, RunF.xs, & RunF.xe coordinates are relative offsets from BlobF.bx.a
557: (their blob's left-top corner). RunF.ac & RunF.bc index into an array of
558: only those RunF records belonging to the current BlobF, in ascending
559: lexicographic order on (RunF.y,RunF.xs) -- so that they are in the range
560: [0,BlobF.runs-1]. If ad(or, bd)==0, the ac(or, bc) is undefined (conn sets
561: them conventionally to 0).
562: */
563:
564: /* Blob management routines (conBlob.c) */
565: Blob *alloc_blob();
566: free_blob();
567: boolean alloc_blob_pool();
568: free_blob_pool();
569: Blob *alloc_pool_blob();
570: free_pool_blob();
571: out_blob();
572: fwrb_blob_etc();
573: boolean frdb_blob_etc();
574: boolean frdb_runfs();
575: err_blob();
576: err_blob_runs();
577: err_blob_runfs();
578: err_blob_briefly();
579: err_blobf();
580: err_blob_stats();
581: boolean blob_small();
582:
583: /* Compute height-above-baseline in ems of Char *cp w.r.t. Txtln *lp,
584: on a page of y-resolution res. The txtln's `basl' & `size' must be set up. */
/* char_bhgt(cp,lp,res): (bottom y of cp's box minus lp's baseline),
   divided by res as a double, then multiplied by INS_PER_PT and by
   lp->size -- evaluated strictly left to right.
   NOTE(review): for a result in ems one would expect the height in points
   to be divided by the text size; confirm the units of INS_PER_PT and
   the intended formula. */
#define char_bhgt(cp,lp,res) \
	( ((cp)->bx.b.y - (lp)->basl) / (double)(res) * INS_PER_PT * (lp)->size )
587:
588: /* an Interpretation of a Char */
589: typedef struct Interp {
590: Ident ident;
591: struct Cl *clp;
592: struct Class *clsp;
593: ClassId ci; /* class id (font, size, name, variant) */
594: Merit mshap; /* shape merit in [0,1] */
595: Pts size; /* implied text size */
596: Merit msize; /* size merit in [0,1] */
597: Scoor basl; /* implied absolute baseline location */
598: Merit mbhgt; /* height-above-baseline merit in [0,1] */
599: Merit m; /* match merit (due to mshap, msize, & mbhgt) */
600: Prob p; /* approximate probability */
601: struct Interp *n; /* next in singly-linked list */
602: } Interp;
603:
604: #define Init_Interp {IsInterp,NULL,NULL,Init_ClassId,0.0,0.0,0.0,0,0.0,0.0,0.0,NULL}
605: #if MAIN
606: Interp empty_Interp = Init_Interp;
607: #else
608: extern Interp empty_Interp;
609: #endif
610:
/* Interp.ident flags: */
#define Interp_spelled	00000000001	/* passed a spelling check */

/* free_interp(i): free() the Interp and take one Interp off the census. */
#define free_interp(i) { \
	free((i)); \
	free_census(Interp,1); \
	}
615:
616: typedef struct InterpF {
617: Ident ident;
618: ClassId ci; /* class id (font, size, name, variant) */
619: Merit mshap; /* shape merit */
620: Pts size; /* implied text size (in points) */
621: Merit msize; /* size merit */
622: Scoor basl; /* implied baseline location */
623: Merit mbhgt; /* height-above-baseline merit */
624: Merit m; /* match merit (due to shp, siz, hgt) */
625: } InterpF;
626:
627: /* a list of interpretations */
/* Interpl - interpretations kept as a linked list. */
typedef struct Interpl {
	short mny;		/* count of entries in the list */
	struct Interp *fi;	/* first entry; mny==0 implies fi==NULL */
} Interpl;

#define Init_Interpl {0,NULL}

/* Defined in the one unit compiled with MAIN set; extern elsewhere. */
#if !MAIN
extern Interpl empty_Interpl;
#else
Interpl empty_Interpl = Init_Interpl;
#endif
639:
640: /* a set of interpretations */
641: typedef struct Interps {
642: short mny; /* no. in set */
643: struct Interp **pa; /* NULL-terminated array of ptrs (malloc spc) */
644: Merit m; /* combined merit (normalized product) */
645: } Interps;
646:
647: #define Init_Interps {0,NULL,0.0}
648: #if MAIN
649: Interps empty_Interps = Init_Interps;
650: #else
651: extern Interps empty_Interps;
652: #endif
653:
654: typedef struct Shapes {
655: short mny; /* no. items in set */
656: short alloc; /* no. items that can fit in array (alloc>=mny) */
657: Nb_s *sa; /* ptr to contiguous array (malloc space) */
658: } Shapes;
659:
660: #define Init_Shapes {0,0,NULL}
661: #if MAIN
662: Shapes empty_Shapes = Init_Shapes;
663: #else
664: extern Shapes empty_Shapes;
665: #endif
666:
/* Shapes arrays grow by this many items at a time. */
#define SH_INCR (20)

/* init_sh(sh): give *sh an array with room for SH_INCR items and mark it
   empty.  abort() is called with a message if malloc() returns NULL. */
#define init_sh(sh) { \
	(sh)->alloc = SH_INCR; \
	(sh)->sa = (Nb_s *)malloc((sh)->alloc*sizeof(Nb_s)); \
	if((sh)->sa==NULL) \
		abort("can't alloc sh->sa[%d]",(sh)->alloc); \
	(sh)->mny = 0; \
	}

/* add_sh(s,sh): append a copy of *s to *sh.  When the array is full it is
   first grown by SH_INCR items with realloc(); abort() is called with a
   message if realloc() returns NULL. */
#define add_sh(s,sh) { \
	if((sh)->alloc==(sh)->mny) { \
		(sh)->alloc += SH_INCR; \
		(sh)->sa = (Nb_s *)realloc((sh)->sa,(sh)->alloc*sizeof(Nb_s)); \
		if((sh)->sa==NULL) \
			abort("can't alloc (sh)->sa[%d]",(sh)->alloc); \
		} \
	(sh)->sa[(sh)->mny++] = *(s); \
	}
684:
685: /* Parameters governing the pseudo-random generation of a Char image
686: using a 1st-order statistical model of imaging. */
687: typedef struct RanParms {
688: short res_x; /* -r resolution (scanner pels/inch) */
689: short res_y; /* (for now, equal to res_x) */
690: Pts size; /* -p size of text */
691: Radians skew; /* -a skew angle */
692: Ems bhgt; /* -b height above baseline */
693: float blur; /* -e blurring std err (scanner pels) */
694: float jitter; /* -j jitter std err (scanner pels) */
695: float kern; /* -k kerning std err (scanner pels) */
696: float speckle; /* -s pel-wise additive noise std err (scanner pels) */
697: float thresh; /* -t threshold for binarization */
698: float xscale; /* -x horizontal scaling (dimensionless) */
699: float yscale; /* -y vertical scaling (dimensionless) */
700: } RanParms;
701: #define Init_RanParms {0,0,0.0,0.0,0.0,0.7,0.0,0.0,0.125,0.25,1.0,1.0}
702: #if MAIN
703: RanParms empty_RanParms = Init_RanParms;
704: #else
705: extern RanParms empty_RanParms;
706: #endif
707:
708: RanParms *alloc_ranparms();
709: RanParms *dup_ranparms();
710: char *ranparms_toa();
711: RanParms *ato_ranparms();
712: fwrb_ranparms();
713: int frdb_ranparms();
714:
715: /* Char - a character: isolated, elementary symbol of the writing system;
716: linguists might call this a `graph' */
717: typedef struct Char {
718: Ident ident; /* feature bits */
719: Bbx bx;
720: Scoor csp; /* space before character in Txtln (abs. coords) */
721: long area; /* no. pixels */
722: long per; /* perimeter (all bdys) */
723: Scoor basl; /* baseline (absolute coordinates, local copy) */
724: /* next should be Blobl */
725: int bmny; /* no. blobs in Char */
726: struct Blob *fi; /* 1st in list (p.n ptrs) (bmny==0 -> fi==NULL) */
727: Pval *sfv; /* scalar-features (SFv) */
728: Shapes sh; /* set of shapes (size- & loc'n-invariant) */
729: Bfeats *bfsp; /* binary features */
730: RanParms *rp; /* randomizing parameters */
731: Interpl il; /* interpretations */
732: char *l; /* label (ASCII string in malloc space) */
733: } Char;
734:
735: #define Init_Char {IsChar,Init_Bbx,0,0L,0L,Scoor_MIN,0,NULL,NULL,Init_Shapes,NULL,NULL,Init_Interpl,NULL}
736: #if MAIN
737: Char empty_Char = Init_Char;
738: #else
739: extern Char empty_Char;
740: #endif
741:
/* Char.ident flags: */
#define Char_spelled	00000000001	/* first Interp is correctly spelled */
#define Char_confused	00000000002	/* classifier may have confused it */
#define Char_termhyp	00000000004	/* terminal hyphen of its Word */
#define Char_omit	00000000010	/* may be omitted */
#define Char_label	00000000020	/* carries an ASCII string label */
#define Char_ranparms	00000000040	/* carries RanParms */
#define Char_split	00000000100	/* produced by splitting a Char */
#define Char_merged	00000000200	/* produced by merging Chars */
751:
752: typedef struct Chars {
753: int mny; /* mny==0 -> cpa==NULL */
754: Char **cpa; /* pts to NULL-term'd array of ptrs (in malloc space) */
755: } Chars;
756:
757: #define Init_Chars {0,NULL}
758: #if MAIN
759: Chars empty_Chars = Init_Chars;
760: #else
761: extern Chars empty_Chars;
762: #endif
763:
764: /* CharF - Char external file format */
765: typedef struct CharF {
766: Ident ident; /* feature bits: IsChar must be set */
767: Bbx bx;
768: Scoor csp; /* space before character in Txtln (abs. coords) */
769: long area;
770: long per;
771: short bmny; /* no. blobs to follow */
772: short imny; /* no. interpretations (follows immediately) */
773: short sfmny; /* no. scalar features (follows immediately) */
774: short shmny; /* no. shape features (follows immediately) */
775: short bfmny; /* no. binary features (follows immediately) */
776: /* if ident&Char_ranparms, RanParms follows CharF */
777: /* if ident&Char_label, label follows CharF ('\0'-terminated string) */
778: } CharF;
779:
780: Char *alloc_char(); /* in Text.c */
781: Char *append_char();
782: Char *insert_char();
783: Char *insert_char_word();
784: Char *dup_char();
785: Char *dup_char_etc();
786: Chars *append_chars();
787: Chars *dup_chars_etc();
788:
/* Words - a set of Words, held as an array of pointers. */
typedef struct Words {
	int mny;		/* count; mny==0 implies wpa==NULL */
	struct Word **wpa;	/* array of ptrs, NULL-terminated */
} Words;

#define Init_Words {0,NULL}

/* Defined in the one unit compiled with MAIN set; extern elsewhere. */
#if !MAIN
extern Words empty_Words;
#else
Words empty_Words = Init_Words;
#endif
800:
801: /* Word - one or more Chars lying in a textline close together.
802: wsp (word space) is always >=0 and is scaled by xheight (of Txtln) */
803: typedef struct Word {
804: Ident ident; /* feature bits */
805: Bbx bx;
806: float wsp; /* space before word (multiple of wst*em) */
807: Merit m; /* Word merit (function of its Char's merits) */
808: Prob p; /* probability */
809: Words ws; /* set of alternative segmentations */
810: Chars cs;
811: Blobs bs;
812: char *l; /* label (ASCII string in malloc space) */
813: int hash; /* hash key for fast equality checking */
814: } Word;
815:
816: #define Init_Word {IsWord,Init_Bbx,0.0,0.0,0.0,Init_Words,Init_Chars,Init_Blobs,NULL,0}
817: #if MAIN
818: Word empty_Word = Init_Word;
819: #else
820: extern Word empty_Word;
821: #endif
822:
823: /* Word-interpretation (as printable ASCII string).
824: All string fields must point to malloc-space strings.
825: */
826: typedef struct WordInterp {
827: Ident ident; /* identifies word type */
828: char *s; /* entire string = pp+by+po+ps */
829: char *pp; /* punctuation prefix */
830: char *by; /* body of word */
831: char *po; /* possessive ('s 'S) or negative (n't N'T) suffix */
832: char *ps; /* punctuation suffix */
833: } WordInterp;
834:
835: #define Init_WordInterp {IsWordInterp,NULL,NULL,NULL,NULL,NULL}
836: #if MAIN
837: WordInterp empty_WordInterp = Init_WordInterp;
838: #else
839: extern WordInterp empty_WordInterp;
840: #endif
841:
842: WordInterp *dup_wordinterp_etc();
843:
/* Word.ident & WordInterp.ident flags.
   (`s' and `by' are the WordInterp string fields; `ps' its suffix.) */
#define Word_spelled	00000000001	/* by is spelled correctly */
#define Word_wf		00000000002	/* s is well-formed */
#define Word_ok		00000000004	/* s is ok ("acceptable") */
#define Word_numeric	00000000010	/* by is numeric */
#define Word_initcap	00000000020	/* by starts with a capital */
#define Word_allcaps	00000000040	/* by is entirely capitals */
#define Word_hyphens	00000000100	/* by contains embedded hyphens */
#define Word_slashes	00000000200	/* by contains embedded slashes */
#define Word_endsent	00000000400	/* sentence end: ps has .!? */
#define Word_termhyp	00000001000	/* some interpretation ends in a hyphen */
#define Word_label	00000002000	/* carries an ASCII string label */
#define Word_allalp	00000004000	/* s is entirely alphabetic */
#define Word_bodalp	00000010000	/* by is entirely alphabetic */
858:
859: /* WordF - Word external file format */
860: typedef struct WordF {
861: Ident ident; /* feature bits: IsWord must be set */
862: Bbx bx;
863: float wsp; /* space before word (multiple of thr) */
864: float m; /* merit */
865: short wmny; /* no. Words (in Word.ws) to follow this Word */
866: short cmny; /* no. Chars to follow this Word */
867: short bmny; /* no. Blobs to follow this Word */
868: /* if ident&Word_label, label follows WordF ('\0'-terminated string) */
869: } WordF;
870:
871: /* constant pitch model for a Txtln */
872: typedef struct ConstPitch {
873: Ems w; /* Character pitch */
874: Scoor o; /* origin (one of the character break points) */
875: float r; /* max/min autocorrelation ratio - the larger the better */
876: } ConstPitch;
877:
878: #define Init_ConstPitch {0.0,0,0.0}
879: #if MAIN
880: ConstPitch empty_ConstPitch = Init_ConstPitch;
881: #else
882: extern ConstPitch empty_ConstPitch;
883: #endif
884:
/* Txtlns - a set of Txtlns. */
typedef struct Txtlns {
	short mny;		/* count of Txtlns */
	struct Txtln **lpa;	/* array of Txtln's; mny==0 implies lpa==NULL */
} Txtlns;

#define Init_Txtlns {0,NULL}

/* Defined in the one unit compiled with MAIN set; extern elsewhere. */
#if !MAIN
extern Txtlns empty_Txtlns;
#else
Txtlns empty_Txtlns = Init_Txtlns;
#endif
896:
897: /* Text Line */
/* Txtln_* are single-bit flags tested against ident (TxtlnF documents the
   `ident&Txtln_label' test). */
898: #define Txtln_label 00000000004 /* has an ASCII string label */
899: #define Txtln_size 00000000002 /* dominant text size chosen */
900: #define Txtln_basl 00000000001 /* dominant baseline chosen */
901:
/* Txtln - one text line together with the alternative segmentations, words,
   chars and blobs attached to it.
   NOTE(review): presumably the in-memory form, with TxtlnF as its file-record
   counterpart -- confirm. */
902: typedef struct Txtln {
903: Ident ident; /* Init_Txtln sets this to IsTxtln */
904: Bbx bx;
905: Pts size; /* dominant text size (0 means unknown) */
906: Scoor basl; /* dominant baseline (absolute y coordinate) */
907: short *proj; /* ptr to malloc space projection array */
908: ConstPitch *cp; /* ptr to malloc space constant-pitch model */
909: Merit m; /* merit */
910: Txtlns ls; /* alternative Txtln segmentations */
911: Words ws; /* sorted asc. on x */
912: Chars cs; /* sorted asc. on bx.a.x */
913: Blobs bs; /* misc. non-char blobs */
914: char *l; /* label (ASCII string in malloc space) */
915: } Txtln;
916:
/* Positional initializer; the twelve values are in the same order as the
   Txtln fields (ident, bx, size, basl, proj, cp, m, ls, ws, cs, bs, l). */
917: #define Init_Txtln {IsTxtln,Init_Bbx,0.0,0,NULL,NULL,0.0,Init_Txtlns,Init_Words,Init_Chars,Init_Blobs,NULL}
/* empty_Txtln is defined here only when MAIN is non-zero; otherwise this
   header just declares it extern. */
918: #if MAIN
919: Txtln empty_Txtln = Init_Txtln;
920: #else
921: extern Txtln empty_Txtln;
922: #endif
923:
924: /* Text Line */
/* TxtlnF - text-line record whose *mny fields count the items that follow it.
   NOTE(review): by analogy with WordF this looks like the external file
   format of Txtln -- confirm.  Note that bmny is an int while the other
   counts are shorts. */
925: typedef struct TxtlnF {
926: Ident ident; /* IsTxtln must be set */
927: Bbx bx;
928: Scoor basl; /* baseline (absolute y coordinate) */
929: Pts size; /* text size (<=0.0 means none is known) */
930: short pmny; /* no. shorts in projection array to follow */
931: float m; /* merit */
932: short lmny; /* no. alternative txtlns to follow */
933: short wmny; /* no. words to follow */
934: short cmny; /* no. chars to follow */
935: int bmny; /* no. blobs to follow */
936: /* if ident&Txtln_label, label follows TxtlnF ('\0'-terminated string) */
937: } TxtlnF;
938:
939: /* blocks of text */
/* Blocks - a counted set of Block pointers. */
940: typedef struct Blocks {
941: short mny; /* if mny==0, bpa==NULL */
942: struct Block **bpa; /* to NULL-term'd array of ptrs */
943: } Blocks;
944:
/* Positional initializer for the empty set: mny = 0, bpa = NULL. */
945: #define Init_Blocks {0,NULL}
/* empty_Blocks is defined here only when MAIN is non-zero; otherwise this
   header just declares it extern. */
946: #if MAIN
947: Blocks empty_Blocks = Init_Blocks;
948: #else
949: extern Blocks empty_Blocks;
950: #endif
951:
952: /* block of text */
/* Block - one block of text; it can hold nested Blocks as well as text lines,
   words, chars and blobs. */
953: typedef struct Block {
954: Ident ident; /* Init_Block sets this to IsBlock */
955: Bbx bx; /* bounding box of block */
956: Radians skew; /* skew angle (as correction to Page.skew) */
957: Radians shear; /* shear angle (as correction to Page.shear) */
958: Ems wst; /* word-space threshold */
959: Merit m; /* merit */
960: Blocks bks; /* Blocks nested within this one */
961: Txtlns ls;
962: Words ws;
963: Chars cs;
964: Blobs bs;
965: char *l; /* label (ASCII string in malloc space) */
966: } Block;
967:
/* Positional initializer; the twelve values are in the same order as the
   Block fields (ident, bx, skew, shear, wst, m, bks, ls, ws, cs, bs, l). */
968: #define Init_Block {IsBlock,Init_Bbx,0.0,0.0,0.0,0.0,Init_Blocks,Init_Txtlns,Init_Words,Init_Chars,Init_Blobs,NULL}
/* empty_Block is defined here only when MAIN is non-zero; otherwise this
   header just declares it extern. */
969: #if MAIN
970: Block empty_Block = Init_Block;
971: #else
972: extern Block empty_Block;
973: #endif
974:
975: /* Ident bits for Blocks */
976: #define Block_wst 00000000001 /* word-space-thresh set up */
977: #define Block_label 00000000002 /* has an ASCII string label */
978:
/* Compile-time switch: when non-zero, BlockF contains its `m' and `bkmny'
   fields (they sit inside `#if Block_mb' in BlockF). */
979: #define Block_mb (1)
980:
981: /* block of text */
/* BlockF - block record; a label string follows it when Block_label is set.
   NOTE(review): by analogy with WordF this looks like the external file
   format of Block, with the *mny fields counting records that follow --
   confirm.  Because `m' and `bkmny' are conditional on Block_mb, the record
   layout depends on that switch. */
982: typedef struct BlockF {
983: Ident ident; /* IsBlock must be set */
984: Bbx bx;
985: Ems wst; /* word-space threshold */
986: float skew;
987: float shear;
988: #if Block_mb
989: float m;
990: short bkmny;
991: #endif
992: short lmny;
993: short wmny;
994: int cmny;
995: int bmny;
996: /* if ident&Block_label, label follows BlockF ('\0'-terminated string) */
997: } BlockF;
998:
999: /* page */
/* Page - one page image: geometry, resolution, and the blocks / lines /
   words / chars / blobs that are not nested inside a lower-level record. */
1000: typedef struct Page {
1001: Ident ident; /* Init_Page sets this to IsPage */
1002: Bbx bx; /* extreme indices in pixels */
1003: short res_x; /* resolution in pixels/inch: x & y */
1004: short res_y;
1005: Radians skew; /* skew angle */
1006: Radians shear; /* shear correction */
1007: Blocks bks; /* blocks */
1008: Txtlns ls; /* textlines (those not in any block) */
1009: Words ws; /* words (not in any textline) */
1010: Chars cs; /* chars (not in any word) */
1011: Blobs bs; /* blobs (not in any char) */
1012: char *l; /* label (ASCII string in malloc space) */
1013: } Page;
1014:
/* Positional initializer; the twelve values are in the same order as the
   Page fields (ident, bx, res_x, res_y, skew, shear, bks, ls, ws, cs, bs, l). */
1015: #define Init_Page {IsPage,Init_Bbx,0,0,0.0,0.0,Init_Blocks,Init_Txtlns,Init_Words,Init_Chars,Init_Blobs,NULL}
/* empty_Page is defined here only when MAIN is non-zero; otherwise this
   header just declares it extern. */
1016: #if MAIN
1017: Page empty_Page = Init_Page;
1018: #else
1019: extern Page empty_Page;
1020: #endif
1021:
/* NOTE(review): Page_new, defined after PageF in this header, has the same
   value (00000000001) as Page_label -- confirm the two are never tested on
   the same ident word. */
1022: #define Page_label 00000000001 /* has a label */
1023:
1024: /* Pages of text */
/* Pages - a counted set of Page pointers.  Unlike Blocks and Txtlns, the
   count here is unsigned. */
1025: typedef struct Pages {
1026: unsigned short mny; /* if mny==0, pa==NULL */
1027: struct Page **pa; /* to NULL-term'd array of ptrs */
1028: } Pages;
1029:
/* Positional initializer for the empty set: mny = 0, pa = NULL. */
1030: #define Init_Pages {0,NULL}
/* empty_Pages is defined here only when MAIN is non-zero; otherwise this
   header just declares it extern. */
1031: #if MAIN
1032: Pages empty_Pages = Init_Pages;
1033: #else
1034: extern Pages empty_Pages;
1035: #endif
1036:
/* PageF - page record; a label string follows it when Page_label is set.
   NOTE(review): by analogy with WordF this looks like the external file
   format of Page -- confirm.  Field order differs from Page: the resolution
   pair comes before bx here. */
1037: typedef struct PageF {
1038: Ident ident; /* IsPage bit must be set */
1039: short res_x,res_y; /* resolution in pixels/inch: x & y */
1040: Bbx bx; /* extreme indices in pixels */
1041: float skew; /* original page skew angle, radians */
1042: float shear; /* original page shear angle, radians */
1043: short bkmny; /* no. blocks */
1044: short lmny; /* no. textlines (not in any block) */
1045: short wmny; /* no. words (not in any textline) */
1046: int cmny; /* no. chars (not in any word) */
1047: int bmny; /* no. blobs (not in any char) */
1048: /* if ident&Page_label, label follows PageF ('\0'-terminated string) */
1049: } PageF;
1050:
/* NOTE(review): same value (00000000001) as Page_label, defined after
   Init_Page in this header -- confirm the two are never tested on the same
   ident word. */
1051: #define Page_new 00000000001 /* Page is in ``new'' format */
1052:
1053: /* Each Dim-file begins with a Doc record */
1054: typedef struct Doc {
1055: unsigned short version; /* file format version number */
1056: Pages ps; /* the pages of the document */
1057: char *l; /* ASCII label */
1058: } Doc;
1059:
/* Positional initializer: version 0, no pages, no label. */
1060: #define Init_Doc {0,Init_Pages,NULL}
/* empty_Doc is defined here only when MAIN is non-zero; otherwise this
   header just declares it extern. */
1061: #if MAIN
1062: Doc empty_Doc = Init_Doc;
1063: #else
1064: extern Doc empty_Doc;
1065: #endif
1066:
1067: char *ident_toa();
1068: Ident cto_ident();
1069: Ident cto_flag();
1070: char *merit_toa();
1071: Page *alloc_page();
1072: char *page_toa();
1073: Page *dup_page();
1074: Page *dup_page_etc();
1075: Block *alloc_block();
1076: char *block_toa();
1077: Block *dup_block();
1078: Block *dup_block_etc();
1079: Block *append_block();
1080: Blocks *dup_blocks_etc();
1081: ConstPitch *alloc_constpitch();
1082: char *constpitch_toa();
1083: Txtln *alloc_txtln();
1084: char *txtln_toa();
1085: Word *alloc_word();
1086: char *word_toa();
1087: boolean eq_word();
1088: int hash_word();
1089: Char *alloc_char();
1090: char *char_toa();
1091: Pp *char_centroid();
1092: Char *char_of_blob();
1093: char *interp_toa();
1094: char *blob_toa();
1095: Pp *blob_centroid();
1096: char *runf_toa();
1097: char *runfs_toa();
1098: char *pp_toa();
1099: char *bdyedge_toa();
1100: Blob *dup_blob();
1101: Blob *dup_blob_etc();
1102: Blob *dup_blobl_etc();
1103: Blob *runs_to_runs();
1104: Blobs *dup_blobs_etc();
1105: Blobs *blobl_to_blobs();
1106: Interp *alloc_interp();
1107: Interp *dup_interp();
1108: Interpl *dup_interpl_etc();
1109: Interps *dup_interps_etc();
1110: Word *append_word();
1111: Word *insert_word();
1112: Word *insert_word_txtln();
1113: Word *dup_word();
1114: Word *dup_word_etc();
1115: Words *dup_words_etc();
1116: Txtln *dup_txtln();
1117: Txtln *dup_txtln_etc();
1118: Txtlns *dup_txtlns_etc();
1119: Block *dup_block();
1120: Block *dup_block_etc();
1121: Radians add_ang();
1122: Radians subtract_ang();
1123:
/* in-line macros */

/* Merge the `source' Bbx into the `destination' Bbx, expanding the dest Bbx
   as required.  The source Bbx is unmodified.  Usage:
	merge_bbx(s,d);
	   Bbx *s,*d;
   The body is wrapped in do { } while (0) so that the call together with its
   trailing semicolon forms exactly one statement; this keeps
	if (c) merge_bbx(s,d); else ...
   well-formed, which a bare { } body does not.
   Both arguments are expanded several times, so they must be free of side
   effects.
   */
#define merge_bbx(s,d) do { \
	if((s)->a.x < (d)->a.x) (d)->a.x = (s)->a.x; \
	if((s)->a.y < (d)->a.y) (d)->a.y = (s)->a.y; \
	if((s)->b.x > (d)->b.x) (d)->b.x = (s)->b.x; \
	if((s)->b.y > (d)->b.y) (d)->b.y = (s)->b.y; \
	} while (0)
1137:
1138: /* Experimental implementation of a data structure for maintaining a set
1139: of distinct Words whose merit falls within a dynamically-varying range.
1140: This implementation is worst-case quadratic time.
1141: BUGS: insert_wordset shouldn't duplicate the word.
1142: */
1143:
1144: #define dbg_ws (0) /* if !=0, enable WordSet debugging tracing */
1145:
1146: typedef struct WordSet {
1147: double cut; /* cut-fraction */
1148: int cap; /* capacity: maximum number permitted at any time */
1149: double top; /* maximum merit seen since allocation */
1150: Words ws; /* sorted by top-choice merit */
1151: Word *max,*min; /* maximum/minimum entries currently in ws */
1152: int high; /* high-water: max. no. entries in history */
1153: } WordSet;
/* Positional initializer, in field order: cut, cap, top, ws, max, min, high.
   NOTE(review): INT_MAX comes from <limits.h>, which is not included in the
   visible part of this header -- confirm a prior include provides it. */
1154: #define Init_WordSet {1.0,INT_MAX,0.0,Init_Words,NULL,NULL,0}
/* empty_WordSet is defined here only when MAIN is non-zero; otherwise this
   header just declares it extern. */
1155: #if MAIN
1156: WordSet empty_WordSet = Init_WordSet;
1157: #else
1158: extern WordSet empty_WordSet;
1159: #endif
1160:
/* WordSet accessors.  `s' is a pointer to a WordSet and is expanded more
   than once by the *_wordmerit forms, so it must be free of side effects.
	size_wordset	current number of entries (ws.mny)
	top_wordset	the `top' field
	max_wordset	the `max' entry pointer (may be NULL)
	min_wordset	the `min' entry pointer (may be NULL)
	max_wordmerit	merit `m' of the max entry, or 0.0 when max is NULL
	min_wordmerit	merit `m' of the min entry, or 0.0 when min is NULL
   */
#define size_wordset(s)		((s)->ws.mny)
#define top_wordset(s)		((s)->top)
#define max_wordset(s)		((s)->max)
#define min_wordset(s)		((s)->min)
#define max_wordmerit(s)	(((s)->max == NULL) ? 0.0 : (s)->max->m)
#define min_wordmerit(s)	(((s)->min == NULL) ? 0.0 : (s)->min->m)
1167:
/* WordSet functions.  These are old-style declarations: the empty parentheses
   give no parameter information, so arguments are not checked at call sites. */
1168: Merit wordmerit();
1169: WordSet *alloc_wordset();
1170: boolean insert_wordset();
1171: Word *remove_wordset();
1172: int free_wordset_etc();
/* Return type spelled out: the declaration used to rely on the implicit-int
   rule, which C99 removed.  `int' is what implicit int meant, so existing
   callers are unaffected. */
int err_wordset();
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.