|
|
1.1 root 1: #include <u.h>
2: #include <libc.h>
3: #include <ctype.h>
/*
 * status of the file being examined; type() fills it in.
 * off plan 9 the plan 9 names print and OREAD are mapped
 * onto local equivalents.
 */
#ifndef plan9
#define	OREAD	0
#define	print	printf
struct stat mbuf;
#else
Dir mbuf;
#endif
11: /*
12: * file - determine type of file
13: */
14:
15: uchar buf[6000];
16: short cfreq[140];
17: short wfreq[50];
18: int nbuf;
19: int flag;
20: int (*call[])(void);
21:
/*
 * indices into the frequency tables.
 * the word classes index wfreq and are assigned in dict[];
 * the character classes index cfreq, above the 7-bit codes.
 */
enum
{
	/* word classes */
	Cword	= 0,	/* counted by isc() as a c declaration word */
	Fword	= 1,	/* presumably fortran words; not consulted in this file */
	Aword	= 2,	/* counted by isas() as an assembler word */
	I1	= 3,	/* the word "include" */
	I2	= 4,	/* a header name such as "libc" */
	I3	= 5,	/* the word "h" */

	/* character classes */
	Clatin	= 128,	/* byte of 128+32 or more */
	Cbinary	= 129,	/* byte in 128 .. 128+31 */
	Cnull	= 130,	/* zero byte */
	Ceascii	= 131,	/* other 7-bit byte that is neither printable nor space */
};
35: struct
36: {
37: char* word;
38: int flag;
39: } dict[] =
40: {
41: "TEXT", Aword,
42: "block", Fword,
43: "char", Cword,
44: "common", Fword,
45: "data", Fword,
46: "dimension", Fword,
47: "double", Cword,
48: "extern", Cword,
49: "fio", I2,
50: "float", Cword,
51: "function", Fword,
52: "h", I3,
53: "include", I1,
54: "int", Cword,
55: "integer", Fword,
56: "libc", I2,
57: "long", Cword,
58: "real", Fword,
59: "register", Cword,
60: "short", Cword,
61: "static", Cword,
62: "stdio", I2,
63: "struct", Cword,
64: "subroutine", Fword,
65: "u", I2,
66: "void", Cword,
67: };
68:
/*
 * bits kept in flag.
 * exactly one of Short/Long is set, and exactly one of the F bits.
 */
enum
{
	Short	= 0x01,	/* at most 100 bytes were read */
	Long	= 0x02,	/* more than 100 bytes were read */

	Fascii	= 0x04,	/* none of the classes below occur */
	Flatin	= 0x08,	/* has Clatin bytes, no Cbinary */
	Fbinary	= 0x10,	/* has Cbinary bytes */
	Feascii	= 0x20,	/* has Ceascii bytes, no Clatin or Cbinary */
	Fnull	= 0x40,	/* has zero bytes, otherwise ascii */
};
80:
81: void type(char*, int);
82: long lendian(uchar*);
83:
/*
 * classify every file named on the command line.
 * the longest name is found first so that type()
 * can line the verdicts up in one column.
 */
int
main(int argc, char *argv[])
{
	int arg, len, widest;

	widest = 0;
	for(arg=1; arg<argc; arg++) {
		len = strlen(argv[arg]);
		if(len > widest)
			widest = len;
	}

	arg = 1;
	while(arg < argc) {
		type(argv[arg], widest);
		arg++;
	}
	exit(0);
}
97:
98: void
99: type(char *file, int nlen)
100: {
101: int i, f, l, m, c;
102: char *p, *ep, word[20];
103:
104: print("%s:%*s", file, nlen-strlen(file)+1, "");
105: #ifdef plan9
106: if(dirstat(file, &mbuf) < 0) {
107: print("cannot stat\n");
108: return;
109: }
110: if(mbuf.mode & CHDIR) {
111: print("directory\n");
112: return;
113: }
114: if(mbuf.type != 'M') {
115: print("special file #%c\n", mbuf.type);
116: return;
117: }
118: #else
119: if(stat(file, &mbuf) < 0) {
120: print("cannot stat\n");
121: return;
122: }
123: switch(mbuf.st_mode&S_IFMT) {
124: case S_IFDIR:
125: print("directory\n");
126: return;
127: case S_IFCHR:
128: print("character special file\n");
129: return;
130: case S_IFBLK:
131: print("block special file\n");
132: return;
133: }
134: #endif
135:
136: f = open(file, OREAD);
137: if(f < 0) {
138: print("cannot open\n");
139: return;
140: }
141: nbuf = read(f, buf, sizeof(buf));
142: close(f);
143:
144: if(nbuf < 0) {
145: print("cannot read\n");
146: return;
147: }
148: if(nbuf == 0) {
149: print("empty\n");
150: return;
151: }
152:
153: /*
154: * build histogram table
155: */
156: memset(cfreq, 0, sizeof(cfreq));
157: flag = 0;
158: if(nbuf > 100)
159: flag |= Long;
160: else
161: flag |= Short;
162:
163: for(i=0; i<nbuf; i++) {
164: f = buf[i] & 0xff;
165: if(f >= 128) {
166: if(f >= 128+32)
167: f = Clatin; /* latin */
168: else
169: f = Cbinary; /* not latin */
170: } else
171: if(!isprint(f) && !isspace(f))
172: if(f == 0)
173: f = Cnull;
174: else
175: f = Ceascii;
176: cfreq[f]++;
177: }
178:
179: /*
180: * gross classify
181: */
182: if(cfreq[Cbinary])
183: flag |= Fbinary;
184: else
185: if(cfreq[Clatin])
186: flag |= Flatin;
187: else
188: if(cfreq[Ceascii])
189: flag |= Feascii;
190: else
191: if(cfreq[Cnull])
192: flag |= Fnull;
193: else
194: flag |= Fascii;
195:
196: if(flag & Fnull) {
197: print("null\n");
198: return;
199: }
200:
201: /*
202: * lookup dictionary words
203: */
204: memset(wfreq, 0, sizeof(wfreq));
205: if(flag & Fascii) {
206: ep = word+sizeof(word)-2;
207: for(i=0; i<nbuf; i++) {
208: f = buf[i];
209: if(!isalpha(f))
210: continue;
211: p = word;
212: for(; i<nbuf; i++) {
213: f = buf[i];
214: if(!isalnum(f))
215: break;
216: *p++ = f;
217: if(p >= ep)
218: break;
219: }
220: *p = 0;
221: f = 0;
222: l = sizeof(dict)/sizeof(dict[0]);
223: for(;;) {
224: if(f >= l)
225: break;
226: m = (f+l)/2;
227: c = strcmp(dict[m].word, word);
228: if(c == 0) {
229: wfreq[dict[m].flag]++;
230: break;
231: }
232: if(c < 0)
233: f = m+1;
234: else
235: l = m;
236: }
237: }
238: }
239:
240: /*
241: * call individual classify routines
242: */
243: for(i=0; call[i]; i++)
244: if((*call[i])())
245: return;
246:
247: /*
248: * if all else fails,
249: * print out gross classification
250: */
251: if(flag & Short)
252: print("short ");
253: if(flag & Fascii)
254: print("ascii\n");
255: else
256: if(flag & Feascii)
257: print("extended ascii\n");
258: else
259: if(flag & Flatin)
260: print("latin ascii\n");
261: else
262: print("binary\n");
263: }
264:
265: long
266: lendian(uchar *p)
267: {
268:
269: return (p[0]) |
270: (p[1] << 8) |
271: (p[2] << 16) |
272: (p[3] << 24);
273: }
274:
275: int
276: long0(void)
277: {
278:
279: switch((unsigned)lendian(buf)) {
280: default:
281: return 0;
282:
283: case 0413:
284: print("demand paged ");
285:
286: case 0410:
287: print("pure ");
288: goto exec;
289:
290: case 0406:
291: print("mpx 68000 ");
292: goto exec;
293:
294: exec:
295: case 0407:
296: print("unix vax executable");
297: if(lendian(buf+4) != 0)
298: print(" not stripped");
299: print("\n");
300: break;
301:
302: case 0411:
303: print("jfr 411 executable\n");
304: break;
305:
306: case 0177555:
307: print("very old archive\n");
308: break;
309:
310: case 0177545:
311: print("old archive\n");
312: break;
313:
314: case 0135246: /* andrew/ehg */
315: print("view2d input file\n");
316: break;
317:
318: case 0135256: /* andrew */
319: print("apl file\n");
320: break;
321:
322: case 0164200: /* td */
323: print("Lucasfilm picture\n");
324: break;
325:
326: case 0600560:
327: print("mux downloadable file\n");
328: break;
329:
330: case 0x07010000:
331: print("68020 plan9 executable\n");
332: break;
333:
334: case 0x07040000:
335: print("mips plan9 executable\n");
336: break;
337:
338: case 0x97010000:
339: print("hobbit plan9 executable\n");
340: break;
341:
342: case 0xab020000:
343: print("sparc plan9 executable\n");
344: break;
345:
346: case 0xeb010000:
347: print("386 plan9 executable\n");
348: break;
349: case 0x0b1f1bdc:
350: print("daisy\n");
351: break;
352: case 0x64205300:
353: print("S data object\n");
354: break;
355: }
356: return 1;
357: }
358:
359: int
360: short0(void)
361: {
362:
363: switch(lendian(buf) & 0xffff) {
364: default:
365: return 0;
366:
367: case 070707:
368: print("cpio archive\n");
369: break;
370:
371: case 0x02f7:
372: print("tex dvi\n");
373: break;
374:
375: case 0405:
376: case 0407:
377: case 0410:
378: case 0411:
379: print("pdp-11 executable\n");
380: break;
381: case 0x0000:
382: print("bitmap\n");
383: break;
384: }
385: return 1;
386: }
387:
/*
 * initial words to classify file.
 * entries come in pairs: the leading string to look for,
 * then the verdict to print.  a null pair ends the table.
 * istring() takes the first match, so a longer string must
 * precede any shorter string that is a prefix of it.
 */
char* iwords[] =
{
	"!<arch>\n__.SYMDEF",	"archive random library",
	"!<arch>\n",		"archive",
	"070707",		"cpio archive - ascii header",
	"#FIG",			"fig output",
	"#!/bin/echo",		"cyntax object file",
	"#!/bin/rc",		"rc executable file",
	"#!/bin/sh",		"sh executable file",
	"%!",			"postscript",
	"@document(",		"imagen",
	"x T i300",		"troff output for i300",
	"x T im300",		"troff output for im300",
	"x T post",		"troff output for post",
	"x T opost",		"troff output for opost",
	"x T Latin1",		"troff output for Latin1",
	"x T 202",		"troff output for 202",
	"x T aps",		"troff output for aps",
	0,			0
};
427:
428: int
429: istring(void)
430: {
431: int i, n;
432: char *p;
433:
434: for(i=0; p=iwords[i]; i+=2) {
435: n = strlen(p);
436: if(nbuf >= n && !strncmp((char*)buf, p, n)) {
437: print("%s\n", iwords[i+1]);
438: return 1;
439: }
440: }
441: if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */
442: for(i=5; i<nbuf; i++)
443: if(buf[i] == '\n')
444: break;
445: print("%.*s picture\n", i-5, buf+5);
446: return 1;
447: }
448: return 0;
449: }
450:
451: /*
452: * low entropy means encrypted
453: */
454: int
455: ismung(void)
456: {
457: int i, bucket[8];
458: float cs;
459:
460: if(nbuf < 64)
461: return 0;
462: memset(bucket, 0, sizeof(bucket));
463: for(i=0; i<64; i++)
464: bucket[(buf[i]>>5)&07] += 1;
465:
466: cs = 0.;
467: for(i=0; i<8; i++)
468: cs += (bucket[i]-8)*(bucket[i]-8);
469: cs /= 8.;
470: if(cs <= 24.322) {
471: if(buf[0]==037 && buf[1]==0235)
472: print("compressed\n");
473: else
474: print("encrypted\n");
475: return 1;
476: }
477: return 0;
478: }
479:
480: /*
481: * english by punctuation and frequencies
482: */
483: int
484: isenglish(void)
485: {
486: int i, vow, comm, rare, badpun, punct;
487: char *p;
488:
489: if(!(flag & (Fascii|Feascii)))
490: return 0;
491: badpun = 0;
492: punct = 0;
493: for(i=0; i<nbuf-1; i++)
494: switch(buf[i]) {
495: case '.':
496: case ',':
497: case ')':
498: case '%':
499: case ';':
500: case ':':
501: case '?':
502: punct++;
503: if(buf[i+1] != ' ' && buf[i+1] != '\n')
504: badpun++;
505: }
506: if(badpun*5 > punct)
507: return 0;
508: if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */
509: return 0;
510: if(2*cfreq[';'] > cfreq['e'])
511: return 0;
512:
513: vow = 0;
514: for(p="AEIOU"; *p; p++) {
515: vow += cfreq[*p];
516: vow += cfreq[tolower(*p)];
517: }
518: comm = 0;
519: for(p="ETAION"; *p; p++) {
520: comm += cfreq[*p];
521: comm += cfreq[tolower(*p)];
522: }
523: rare = 0;
524: for(p="VJKQXZ"; *p; p++) {
525: rare += cfreq[*p];
526: rare += cfreq[tolower(*p)];
527: }
528: if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
529: print("English text\n");
530: return 1;
531: }
532: return 0;
533: }
534:
535: int
536: isc(void)
537: {
538: int n;
539:
540: n = wfreq[I1];
541: /*
542: * includes
543: */
544: if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
545: goto yes;
546: /*
547: * declarations
548: */
549: if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
550: goto yes;
551: /*
552: * assignments
553: */
554: if(cfreq[';'] >= 10 && cfreq['='] >= 10)
555: goto yes;
556: return 0;
557:
558: yes:
559: print("c program text\n");
560: return 1;
561: }
562:
563: int
564: isas(void)
565: {
566:
567: /*
568: * includes
569: */
570: if(wfreq[Aword] >= 2)
571: goto yes;
572: return 0;
573:
574: yes:
575: print("assembler program text\n");
576: return 1;
577: }
578:
579: int
580: iscint(void)
581: {
582:
583: if(buf[0] == 0x3a) /* as = ANAME */
584: if(buf[1] == 0x11) /* type = D_FILE */
585: if(buf[2] == 1) /* sym */
586: if(buf[3] == '<') { /* name of file */
587: print("mips .v intermediate\n");
588: return 1;
589: }
590:
591: if(buf[0] == 0x4d) /* aslo = ANAME */
592: if(buf[1] == 0x01) /* ashi = ANAME */
593: if(buf[2] == 0x32) /* type = D_FILE */
594: if(buf[3] == 1) /* sym */
595: if(buf[4] == '<') { /* name of file */
596: print("68020 .2 intermediate\n");
597: return 1;
598: }
599:
600: if(buf[0] == 0x43) /* as = ANAME */
601: if(buf[1] == 0x0d) /* type */
602: if(buf[2] == 1) /* sym */
603: if(buf[3] == '<') { /* name of file */
604: print("hobbit .z intermediate\n");
605: return 1;
606: }
607:
608: if(buf[0] == 0x74) /* as = ANAME */
609: if(buf[1] == 0x10) /* type */
610: if(buf[2] == 1) /* sym */
611: if(buf[3] == '<') { /* name of file */
612: print("sparc .k intermediate\n");
613: return 1;
614: }
615:
616: if(buf[0] == 0x7e) /* aslo = ANAME */
617: if(buf[1] == 0x00) /* ashi = ANAME */
618: if(buf[2] == 0x45) /* type = D_FILE */
619: if(buf[3] == 1) /* sym */
620: if(buf[4] == '<') { /* name of file */
621: print("386 .8 intermediate\n");
622: return 1;
623: }
624:
625: return 0;
626: }
627:
628: /*
629: * pick up a number with
630: * syntax _*[0-9]+_
631: */
632: #define P9BITLEN 12
633: int
634: p9bitnum(uchar *bp)
635: {
636: int n, c, len;
637:
638: len = P9BITLEN;
639: while(*bp == ' ') {
640: bp++;
641: len--;
642: if(len <= 0)
643: return -1;
644: }
645: n = 0;
646: while(len > 1) {
647: c = *bp++;
648: if(!isdigit(c))
649: return -1;
650: n = n*10 + c-'0';
651: len--;
652: }
653: if(*bp != ' ')
654: return -1;
655: return n;
656: }
657:
658: int
659: isp9bit(void)
660: {
661: int ldep, lox, loy, hix, hiy;
662: long len;
663:
664: ldep = p9bitnum(buf + 0*P9BITLEN);
665: lox = p9bitnum(buf + 1*P9BITLEN);
666: loy = p9bitnum(buf + 2*P9BITLEN);
667: hix = p9bitnum(buf + 3*P9BITLEN);
668: hiy = p9bitnum(buf + 4*P9BITLEN);
669:
670: if(ldep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
671: return 0;
672:
673: len = (hix-lox) * (1<<ldep); /* row length */
674: len = (len + 7) / 8; /* rounded to bytes */
675: len *= (hiy-loy); /* col length */
676: len += 60; /* size of initial ascii */
677:
678: /*
679: * for regular file length is non-zero and must match calculation above
680: * for /dev/window and /dev/screen the length is always zero
681: */
682: #ifdef plan9
683: if(mbuf.length != len && mbuf.length != 0)
684: #else
685: if(mbuf.st_size != len && mbuf.st_size != 0)
686: #endif
687: return 0;
688: print("plan 9 bitmap\n");
689: return 1;
690: }
691:
692: int (*call[])(void) =
693: {
694: long0, /* recognizable by first 4 bytes */
695: short0, /* recognizable by first 2 bytes */
696: istring, /* recognizable by first string */
697: iscint, /* c intermediate */
698: isc, /* c compiler key words */
699: isas, /* assembler key words */
700: ismung, /* entropy compressed/encrypted */
701: isenglish, /* char frequency English */
702: isp9bit, /* plan 9 bitmap (as from /dev/window) */
703: 0
704: };
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.