Annotation of 43BSDReno/share/doc/usd/19.awk/awk, revision 1.1.1.1

1.1       root        1: .\"    @(#)awk 6.1 (Berkeley) 5/22/86
                      2: .\"
                      3: .EH 'USD:19-%''Awk \(em A Pattern Scanning and Processing Language'
                      4: .OH 'Awk \(em A Pattern Scanning and Processing Language''USD:19-%'
                      5: .\" .fp 3 G  no G on APS (use gb) or Dandelion Printer (use CW)
                      6: .\" the .T is only a ditroff feature...
                      7: .if '\*.T'dp' .fp 3 El
                      8: .if '\*.T'aps' .fp 3 gB
                      9: ....TM "78-1271-12, 78-1273-6" 39199 39199-11
                     10: .ND "September 1, 1978"
                     11: ....TR 68
                     12: .\".RP
                     13: .      \" macros here
                     14: .tr _\(em
                     15: .if t .tr ~\(ap
                     16: .tr |\(or
                     17: .tr *\(**
                     18: .de UC
                     19: \&\\$3\s-1\\$1\\s0\&\\$2
                     20: ..
                     21: .de IT
                     22: .if n .ul
                     23: \&\\$3\f2\\$1\fP\|\\$2
                     24: ..
                     25: .de UL
                     26: .if n .ul
                     27: \&\\$3\f3\\$1\fP\&\\$2
                     28: ..
                     29: .de P1
                     30: .DS I 3n
                     31: .nf
                     32: .if n .ta 5 10 15 20 25 30 35 40 45 50 55 60
                     33: .if t .ta .3i .6i .9i 1.2i
                     34: .if t .tr -\-'\(fm*\(**
                     35: .if t .tr _\(ul
                     36: .ft 3
                     37: .lg 0
                     38: .ss 18
                     39: .              \"use first argument as indent if present
                     40: ..
                     41: .de P2
                     42: .ps \\n(PS
                     43: .vs \\n(VSp
                     44: .ft R
                     45: .ss 12
                     46: .if n .ls 2
                     47: .tr --''``^^!!
                     48: .if t .tr _\(em
                     49: .fi
                     50: .lg
                     51: .DE
                     52: ..
                     53: .hw semi-colon
                     54: .hy 14
                     55: .              \"2=not last lines; 4= no -xx; 8=no xx-
                     56: .              \"special chars in programs
                     57: .de WS
                     58: .sp \\$1
                     59: ..
                     60: .      \" end of macros
                     61: .TL
                     62: Awk \(em A Pattern Scanning and Processing Language
                     63: .br
                     64: (Second Edition)
                     65: .AU "MH 2C-522" 4862
                     66: Alfred V. Aho
                     67: .AU "MH 2C-518" 6021
                     68: Brian W. Kernighan
                     69: .AU "MH 2C-514" 7214
                     70: Peter J. Weinberger
                     71: .AI
                     72: .MH
                     73: .AB
                     74: .IT Awk
                     75: is a programming language whose
                     76: basic operation
                     77: is to search a set of files
                     78: for patterns, and to perform specified actions upon lines or fields of lines which
                     79: contain instances of those patterns.
                     80: .IT Awk
                     81: makes certain data selection and transformation operations easy to express;
                     82: for example, the
                     83: .IT awk
                     84: program
                     85: .sp
                     86: .ce
                     87: .ft 3
                     88: length > 72
                     89: .ft
                     90: .sp
                     91: prints all input lines whose length exceeds 72 characters;
                     92: the program
                     93: .ce
                     94: .sp
                     95: .ft 3
                     96: NF % 2 == 0
                     97: .ft R
                     98: .sp
                     99: prints all lines with an even number of fields;
                    100: and the program
                    101: .ce
                    102: .sp
                    103: .ft 3
                    104: { $1 = log($1); print }
                    105: .ft R
                    106: .sp
                    107: replaces the first field of each line by its logarithm.
                    108: .PP
                    109: .IT Awk
                    110: patterns may include arbitrary boolean combinations of regular expressions
                    111: and of relational operators on strings, numbers, fields, variables, and array elements.
                    112: Actions may include the same pattern-matching constructions as in patterns,
                    113: as well as
                    114: arithmetic and string expressions and assignments,
                    115: .UL if-else ,
                    116: .UL while ,
                    117: .UL for
                    118: statements,
                    119: and multiple output streams.
                    120: .PP
                    121: This report contains a user's guide, a discussion of the design and implementation of
                    122: .IT awk ,
                    123: and some timing statistics.
                    124: ....It supersedes TM-77-1271-5, dated September 8, 1977.
                    125: .AE
                    126: .CS 6 1 7 0 1 4
                    127: .if n .ls 2
                    128: .nr PS 9
                    129: .nr VS 11
                    130: .NH
                    131: Introduction
                    132: .if t .2C
                    133: .PP
                    134: .IT Awk
                    135: is a programming language designed to make
                    136: many common
                    137: information retrieval and text manipulation tasks
                    138: easy to state and to perform.
                    139: .PP
                    140: The basic operation of
                    141: .IT awk
                    142: is to scan a set of input lines in order,
                    143: searching for lines which match any of a set of patterns
                    144: which the user has specified.
                    145: For each pattern, an action can be specified;
                    146: this action will be performed on each line that matches the pattern.
                    147: .PP
                    148: Readers familiar with the
                    149: .UX
                    150: program
                    151: .IT grep\|
                    152: .[
                    153: unix program manual
                    154: .]
                    155: will recognize
                    156: the approach, although in
                    157: .IT awk
                    158: the patterns may be more
                    159: general than in
                    160: .IT grep ,
                    161: and the actions allowed are more involved than merely
                    162: printing the matching line.
                    163: For example, the
                    164: .IT awk
                    165: program
                    166: .P1
                    167: {print $3, $2}
                    168: .P2
                    169: prints the third and second columns of a table
                    170: in that order.
                    171: The program
                    172: .P1
                    173: $2 ~ /A\||B\||C/
                    174: .P2
                    175: prints all input lines with an A, B, or C in the second field.
                    176: The program
                    177: .P1
                    178: $1 != prev     { print; prev = $1 }
                    179: .P2
                    180: prints all lines in which the first field is different
                    181: from the previous first field.
                    182: .NH 2
                    183: Usage
                    184: .PP
                    185: The command
                    186: .P1
                    187: awk  program  [files]
                    188: .P2
                    189: executes the
                    190: .IT awk
                    191: commands in
                    192: the string
                    193: .UL program
                    194: on the set of named files,
                    195: or on the standard input if there are no files.
                    196: The statements can also be placed in a file
                    197: .UL pfile ,
                    198: and executed by the command
                    199: .P1
                    200: awk  -f pfile  [files]
                    201: .P2
                    202: .NH 2
                    203: Program Structure
                    204: .PP
                    205: An
                    206: .IT awk
                    207: program is a sequence of statements of the form:
                    208: .P1
                    209: .ft I
                    210:        pattern { action }
                    211:        pattern { action }
                    212:        ...
                    213: .ft 3
                    214: .P2
                    215: Each line of input
                    216: is matched against
                    217: each of the patterns in turn.
                    218: For each pattern that matches, the associated action
                    219: is executed.
                    220: When all the patterns have been tested, the next line
                    221: is fetched and the matching starts over.
                    222: .PP
                    223: Either the pattern or the action may be left out,
                    224: but not both.
                    225: If there is no action for a pattern,
                    226: the matching line is simply
                    227: copied to the output.
                    228: (Thus a line which matches several patterns can be printed several times.)
                    229: If there is no pattern for an action,
                    230: then the action is performed for every input line.
                    231: A line which matches no pattern is ignored.
                    232: .PP
                    233: Since patterns and actions are both optional,
                    234: actions must be enclosed in braces
                    235: to distinguish them from patterns.
                    236: .NH 2
                    237: Records and Fields
                    238: .PP
                    239: .IT Awk
                    240: input is divided into
                    241: ``records'' terminated by a record separator.
                    242: The default record separator is a newline,
                    243: so by default
                    244: .IT awk
                    245: processes its input a line at a time.
                    246: The number of the current record is available in a variable
                    247: named
                    248: .UL NR .
                    249: .PP
                    250: Each input record
                    251: is considered to be divided into ``fields.''
                    252: Fields are normally separated by
                    253: white space \(em blanks or tabs \(em
                    254: but the input field separator may be changed, as described below.
                    255: Fields are referred to as
                    256: .UL "$1, $2,"
                    257: and so forth,
                    258: where
                    259: .UL $1
                    260: is the first field,
                    261: and
                    262: .UL $0
                    263: is the whole input record itself.
                    264: Fields may be assigned to.
                    265: The number of fields in the current record
                    266: is available in a variable named
                    267: .UL NF .
                    268: .PP
                    269: The variables
                    270: .UL FS
                    271: and
                    272: .UL RS
                    273: refer to the input field and record separators;
                    274: they may be changed at any time to any single character.
                    275: The optional command-line argument
                    276: \f3\-F\fIc\fR
                    277: may also be used to set
                    278: .UL FS
                    279: to the character
                    280: .IT c .
                    281: .PP
                    282: If the record separator is empty,
                    283: an empty input line is taken as the record separator,
                    284: and blanks, tabs and newlines are treated as field separators.
                    285: .PP
                    286: The variable
                    287: .UL FILENAME
                    288: contains the name of the current input file.
                    289: .NH 2
                    290: Printing
                    291: .PP
                    292: An action may have no pattern,
                    293: in which case the action is executed for
                    294: all
                    295: lines.
                    296: The simplest action is to print some or all of a record;
                    297: this is accomplished by the
                    298: .IT awk
                    299: command
                    300: .UL print .
                    301: The
                    302: .IT awk
                    303: program
                    304: .P1
                    305: { print }
                    306: .P2
                    307: prints each record, thus copying the input to the output intact.
                    308: More useful is to print a field or fields from each record.
                    309: For instance, 
                    310: .P1
                    311: print $2, $1
                    312: .P2
                    313: prints the first two fields in reverse order.
                    314: Items separated by a comma in the print statement will be separated by the current output field separator
                    315: when output.
                    316: Items not separated by commas will be concatenated,
                    317: so
                    318: .P1
                    319: print $1 $2
                    320: .P2
                    321: runs the first and second fields together.
                    322: .PP
                    323: The predefined variables
                    324: .UL NF
                    325: and
                    326: .UL NR
                    327: can be used;
                    328: for example
                    329: .P1
                    330: { print NR, NF, $0 }
                    331: .P2
                    332: prints each record preceded by the record number and the number of fields.
                    333: .PP
                    334: Output may be diverted to multiple files;
                    335: the program
                    336: .P1
                    337: { print $1 >"foo1"; print $2 >"foo2" }
                    338: .P2
                    339: writes the first field,
                    340: .UL $1 ,
                    341: on the file
                    342: .UL foo1 ,
                    343: and the second field on file
                    344: .UL foo2 .
                    345: The
                    346: .UL >>
                    347: notation can also be used:
                    348: .P1
                    349: print $1 >>"foo"
                    350: .P2
                    351: appends the output to the file
                    352: .UL foo .
                    353: (In each case,
                    354: the output files are
                    355: created if necessary.)
                    356: The file name can be a variable or a field as well as a constant;
                    357: for example,
                    358: .P1
                    359: print $1 >$2
                    360: .P2
                    361: uses the contents of field 2 as a file name.
                    362: .PP
                    363: Naturally there is a limit on the number of output files;
                    364: currently it is 10.
                    365: .PP
                    366: Similarly, output can be piped into another process
                    367: (on
                    368: .UC UNIX
                    369: only); for instance,
                    370: .P1
                    371: print | "mail bwk"
                    372: .P2
                    373: mails the output to
                    374: .UL bwk .
                    375: .PP
                    376: The variables
                    377: .UL OFS
                    378: and
                    379: .UL ORS
                    380: may be used to change the current
                    381: output field separator and output
                    382: record separator.
                    383: The output record separator is
                    384: appended to the output of the
                    385: .UL print
                    386: statement.
                    387: .PP
                    388: .IT Awk
                    389: also provides the
                    390: .UL printf
                    391: statement for output formatting:
                    392: .P1
                    393: printf format expr, expr, ...
                    394: .P2
                    395: formats the expressions in the list
                    396: according to the specification
                    397: in
                    398: .UL format
                    399: and prints them.
                    400: For example,
                    401: .P1
                    402: printf "%8.2f  %10ld\en", $1, $2
                    403: .P2
                    404: prints 
                    405: .UL $1
                    406: as a floating point number 8 digits wide,
                    407: with two after the decimal point,
                    408: and
                    409: .UL $2
                    410: as a 10-digit long decimal number,
                    411: followed by a newline.
                    412: No output separators are produced automatically;
                    413: you must add them yourself,
                    414: as in this example.
                    415: The version of
                    416: .UL printf
                    417: is identical to that used with C.
                    418: .[
                    419: C programm language prentice hall 1978
                    420: .]
                    421: .NH 1
                    422: Patterns
                    423: .PP
                    424: A pattern in front of an action acts as a selector
                    425: that determines whether the action is to be executed.
                    426: A variety of expressions may be used as patterns:
                    427: regular expressions,
                    428: arithmetic relational expressions,
                    429: string-valued expressions,
                    430: and arbitrary boolean
                    431: combinations of these.
                    432: .NH 2
                    433: BEGIN and END
                    434: .PP
                    435: The special pattern
                    436: .UL BEGIN
                    437: matches the beginning of the input,
                    438: before the first record is read.
                    439: The pattern
                    440: .UL END
                    441: matches the end of the input,
                    442: after the last record has been processed.
                    443: .UL BEGIN
                    444: and
                    445: .UL END
                    446: thus provide a way to gain control before and after processing,
                    447: for initialization and wrapup.
                    448: .PP
                    449: As an example, the field separator
                    450: can be set to a colon by
                    451: .P1
                    452: BEGIN  { FS = ":" }
                    453: .ft I
                    454: \&... rest of program ...
                    455: .ft 3
                    456: .P2
                    457: Or the input lines may be counted by
                    458: .P1
                    459: END  { print NR }
                    460: .P2
                    461: If
                    462: .UL BEGIN
                    463: is present, it must be the first pattern;
                    464: .UL END
                    465: must be the last if used.
                    466: .NH 2
                    467: Regular Expressions
                    468: .PP
                    469: The simplest regular expression is a literal string of characters
                    470: enclosed in slashes,
                    471: like
                    472: .P1
                    473: /smith/
                    474: .P2
                    475: This
                    476: is actually a complete
                    477: .IT awk
                    478: program which
                    479: will print all lines which contain any occurrence
                    480: of the name ``smith''.
                    481: If a line contains ``smith''
                    482: as part of a larger word,
                    483: it will also be printed, as in
                    484: .P1
                    485: blacksmithing
                    486: .P2
                    487: .PP
                    488: .IT Awk
                    489: regular expressions include the regular expression
                    490: forms found in
                    491: the
                    492: .UC UNIX
                    493: text editor
                    494: .IT ed\|
                    495: .[
                    496: unix program manual
                    497: .]
                    498: and
                    499: .IT grep
                    500: (without back-referencing).
                    501: In addition,
                    502: .IT awk
                    503: allows
                    504: parentheses for grouping, | for alternatives,
                    505: .UL +
                    506: for ``one or more'', and
                    507: .UL ?
                    508: for ``zero or one'',
                    509: all as in
                    510: .IT lex .
                    511: Character classes
                    512: may be abbreviated:
                    513: .UL [a\-zA\-Z0\-9]
                    514: is the set of all letters and digits.
                    515: As an example,
                    516: the
                    517: .IT awk
                    518: program
                    519: .P1
                    520: /[Aa]ho\||[Ww]einberger\||[Kk]ernighan/
                    521: .P2
                    522: will print all lines which contain any of the names
                    523: ``Aho,'' ``Weinberger'' or ``Kernighan,''
                    524: whether capitalized or not.
                    525: .PP
                    526: Regular expressions
                    527: (with the extensions listed above)
                    528: must be enclosed in slashes,
                    529: just as in
                    530: .IT ed
                    531: and
                    532: .IT sed .
                    533: Within a regular expression,
                    534: blanks and the regular expression
                    535: metacharacters are significant.
                    536: To turn off the magic meaning
                    537: of one of the regular expression characters,
                    538: precede it with a backslash.
                    539: An example is the pattern
                    540: .P1
                    541: /\|\e/\^.\^*\e//
                    542: .P2
                    543: which matches any string of characters
                    544: enclosed in slashes.
                    545: .PP
                    546: One can also specify that any field or variable
                    547: matches
                    548: a regular expression (or does not match it) with the operators
                    549: .UL ~
                    550: and
                    551: .UL !~ .
                    552: The program
                    553: .P1
                    554: $1 ~ /[jJ]ohn/
                    555: .P2
                    556: prints all lines where the first field matches ``john'' or ``John.''
                    557: Notice that this will also match ``Johnson'', ``St. Johnsbury'', and so on.
                    558: To restrict it to exactly
                    559: .UL [jJ]ohn ,
                    560: use
                    561: .P1
                    562: $1 ~ /^[jJ]ohn$/
                    563: .P2
                    564: The caret ^ refers to the beginning
                    565: of a line or field;
                    566: the dollar sign
                    567: .UL $
                    568: refers to the end.
                    569: .NH 2
                    570: Relational Expressions
                    571: .PP
                    572: An
                    573: .IT awk
                    574: pattern can be a relational expression
                    575: involving the usual relational operators
                    576: .UL < ,
                    577: .UL <= ,
                    578: .UL == ,
                    579: .UL != ,
                    580: .UL >= ,
                    581: and
                    582: .UL > .
                    583: An example is
                    584: .P1
                    585: $2 > $1 + 100
                    586: .P2
                    587: which selects lines where the second field
                    588: is more than 100 greater than the first field.
                    589: Similarly,
                    590: .P1
                    591: NF % 2 == 0
                    592: .P2
                    593: prints lines with an even number of fields.
                    594: .PP
                    595: In relational tests, if neither operand is numeric,
                    596: a string comparison is made;
                    597: otherwise it is numeric.
                    598: Thus,
                    599: .P1
                    600: $1 >= "s"
                    601: .P2
                    602: selects lines that begin with an
                    603: .UL s ,
                    604: .UL t ,
                    605: .UL u ,
                    606: etc.
                    607: In the absence of any other information,
                    608: fields are treated as strings, so
                    609: the program
                    610: .P1
                    611: $1 > $2
                    612: .P2
                    613: will perform a string comparison.
                    614: .NH 2
                    615: Combinations of Patterns
                    616: .PP
                    617: A pattern can be any boolean combination of patterns,
                    618: using the operators
                    619: .UL \||\||
                    620: (or),
                    621: .UL &&
                    622: (and), and
                    623: .UL !
                    624: (not).
                    625: For example,
                    626: .P1
                    627: $1 >= "s" && $1 < "t" && $1 != "smith"
                    628: .P2
                    629: selects lines where the first field begins with ``s'', but is not ``smith''.
                    630: .UL &&
                    631: and
                    632: .UL \||\||
                    633: guarantee that their operands
                    634: will be evaluated
                    635: from left to right;
                    636: evaluation stops as soon as the truth or falsehood
                    637: is determined.
                    638: .NH 2
                    639: Pattern Ranges
                    640: .PP
                    641: The ``pattern'' that selects an action may also
                    642: consist of two patterns separated by a comma, as in
                    643: .P1
                    644: pat1, pat2     { ... }
                    645: .P2
                    646: In this case, the action is performed for each line between
                    647: an occurrence of
                    648: .UL pat1
                    649: and the next occurrence of
                    650: .UL pat2
                    651: (inclusive).
                    652: For example,
                    653: .P1
                    654: /start/, /stop/
                    655: .P2
                    656: prints all lines between
                    657: .UL start
                    658: and
                    659: .UL stop ,
                    660: while
                    661: .P1
                    662: NR == 100, NR == 200 { ... }
                    663: .P2
                    664: does the action for lines 100 through 200
                    665: of the input.
                    666: .NH 1
                    667: Actions
                    668: .PP
                    669: An
                    670: .IT awk
                    671: action is a sequence of action statements
                    672: terminated by newlines or semicolons.
                    673: These action statements can be used to do a variety of
                    674: bookkeeping and string manipulating tasks.
                    675: .NH 2
                    676: Built-in Functions
                    677: .PP
                    678: .IT Awk
                    679: provides a ``length'' function
                    680: to compute the length of a string of characters.
                    681: This program prints each record,
                    682: preceded by its length:
                    683: .P1
                    684: {print length, $0}
                    685: .P2
                    686: .UL length
                    687: by itself is a ``pseudo-variable'' which
                    688: yields the length of the current record;
                    689: .UL length(argument)
                    690: is a function which yields the length of its argument,
                    691: as in
                    692: the equivalent
                    693: .P1
                    694: {print length($0), $0}
                    695: .P2
                    696: The argument may be any expression.
                    697: .PP
                    698: .IT Awk
                    699: also
                    700: provides the arithmetic functions
                    701: .UL sqrt ,
                    702: .UL log ,
                    703: .UL exp ,
                    704: and
                    705: .UL int ,
                    706: for
                    707: square root,
                    708: base
                    709: .IT e
                    710: logarithm,
                    711: exponential,
                    712: and integer part of their respective arguments.
                    713: .PP
                    714: The name of one of these built-in functions,
                    715: without argument or parentheses,
                    716: stands for the value of the function on the
                    717: whole record.
                    718: The program
                    719: .P1
                    720: length < 10 || length > 20
                    721: .P2
                    722: prints lines whose length
                    723: is less than 10 or greater
                    724: than 20.
                    725: .PP
                    726: The function
                    727: .UL substr(s,\ m,\ n)
                    728: produces the substring of
                    729: .UL s
                    730: that begins at position
                    731: .UL m
                    732: (origin 1)
                    733: and is at most
                    734: .UL n
                    735: characters long.
                    736: If
                    737: .UL n
                    738: is omitted, the substring goes to the end of
                    739: .UL s .
                    740: The function
                    741: .UL index(s1,\ s2)
                    742: returns the position where the string
                    743: .UL s2
                    744: occurs in
                    745: .UL s1 ,
                    746: or zero if it does not.
                    747: .PP
                    748: The function
                    749: .UL sprintf(f,\ e1,\ e2,\ ...)
                    750: produces the value of the expressions
                    751: .UL e1 ,
                    752: .UL e2 ,
                    753: etc.,
                    754: in the
                    755: .UL printf
                    756: format specified by
                    757: .UL f .
                    758: Thus, for example,
                    759: .P1
                    760: x = sprintf("%8.2f %10ld", $1, $2)
                    761: .P2
                    762: sets
                    763: .UL x
                    764: to the string produced by formatting
                    765: the values of
                    766: .UL $1
                    767: and
                    768: .UL $2 .
                    769: .NH 2
                    770: Variables, Expressions, and Assignments
                    771: .PP
                    772: .IT Awk
                    773: variables take on numeric (floating point)
                    774: or string values according to context.
                    775: For example, in
                    776: .P1
                    777: x = 1
                    778: .P2
                    779: .UL x
                    780: is clearly a number, while in
                    781: .P1
                    782: x = "smith"
                    783: .P2
                    784: it is clearly a string.
                    785: Strings are converted to numbers and
                    786: vice versa whenever context demands it.
                    787: For instance,
                    788: .P1
                    789: x = "3" + "4"
                    790: .P2
                    791: assigns 7 to
                    792: .UL x .
                    793: Strings which cannot be interpreted
                    794: as numbers in a numerical context
                    795: will generally have numeric value zero,
                    796: but it is unwise to count on this behavior.
                    797: .PP
                    798: By default, variables (other than built-ins) are initialized to the null string,
                    799: which has numerical value zero;
                    800: this eliminates the need for most
                    801: .UL BEGIN
                    802: sections.
                    803: For example, the sums of the first two fields can be computed by
                    804: .P1
                    805:        { s1 += $1; s2 += $2 }
                    806: END    { print s1, s2 }
                    807: .P2
                    808: .PP
                    809: Arithmetic is done internally in floating point.
                    810: The arithmetic operators are
                    811: .UL + ,
                    812: .UL \- ,
                    813: .UL \(** ,
                    814: .UL / ,
                    815: and
                    816: .UL %
                    817: (mod).
                    818: The C increment
                    819: .UL ++
                    820: and
                    821: decrement
                    822: .UL \-\-
                    823: operators are also available,
                    824: and so are the assignment operators
                    825: .UL += ,
                    826: .UL \-= ,
                    827: .UL *= ,
                    828: .UL /= ,
                    829: and
                    830: .UL %= .
                    831: These operators may all be used in expressions.
                    832: .NH 2
                    833: Field Variables
                    834: .PP
                    835: Fields in
                    836: .IT awk
                    837: share essentially all of the properties of variables _
                    838: they may be used in arithmetic or string operations,
                    839: and may be assigned to.
                    840: Thus one can
                    841: replace the first field with a sequence number like this:
                    842: .P1
                    843: { $1 = NR; print }
                    844: .P2
                    845: or
                    846: accumulate two fields into a third, like this:
                    847: .P1
                    848: { $1 = $2 + $3; print $0 }
                    849: .P2
                    850: or assign a string to a field:
                    851: .P1
                    852: { if ($3 > 1000)
                    853:        $3 = "too big"
                    854:   print
                    855: }
                    856: .P2
                    857: which replaces the third field by ``too big'' when it is,
                    858: and in any case prints the record.
                    859: .PP
                    860: Field references may be numerical expressions,
                    861: as in
                    862: .P1
                    863: { print $i, $(i+1), $(i+n) }
                    864: .P2
                    865: Whether a field is deemed numeric or string depends on context;
                    866: in ambiguous cases like
                    867: .P1
                    868: if ($1 == $2) ...
                    869: .P2
                    870: fields are treated as strings.
                    871: .PP
                    872: Each input line is split into fields automatically as necessary.
                    873: It is also possible to split any variable or string
                    874: into fields:
                    875: .P1
                    876: n = split(s, array, sep)
                    877: .P2
                    878: splits the
                    879: string
                    880: .UL s
                    881: into
                    882: .UL array[1] ,
                    883: \&...,
                    884: .UL array[n] .
                    885: The number of elements found is returned.
                    886: If the
                    887: .UL sep
                    888: argument is provided, it is used as the field separator;
                    889: otherwise
                    890: .UL FS
                    891: is used as the separator.
                    892: .NH 2
                    893: String Concatenation
                    894: .PP
                    895: Strings may be concatenated.
                    896: For example
                    897: .P1
                    898: length($1 $2 $3)
                    899: .P2
                    900: returns the combined length of the first three fields.
                    901: Or in a
                    902: .UL print
                    903: statement,
                    904: .P1
                    905: print $1 " is " $2
                    906: .P2
                    907: prints
                    908: the two fields separated by `` is ''.
                    909: Variables and numeric expressions may also appear in concatenations.
                    910: .NH 2
                    911: Arrays
                    912: .PP
                    913: Array elements are not declared;
                    914: they spring into existence by being mentioned.
                    915: Subscripts may have
                    916: .ul
                    917: any
                    918: non-null
                    919: value, including non-numeric strings.
                    920: As an example of a conventional numeric subscript,
                    921: the statement
                    922: .P1
                    923: x[NR] = $0
                    924: .P2
                    925: assigns the current input record to
                    926: the
                    927: .UL NR -th
                    928: element of the array
                    929: .UL x .
                    930: In fact, it is possible in principle (though perhaps slow)
                    931: to process the entire input in a random order with the
                    932: .IT awk
                    933: program
                    934: .P1
                    935:        { x[NR] = $0 }
                    936: END    { \fI... program ...\fP }
                    937: .P2
                    938: The first action merely records each input line in
                    939: the array
                    940: .UL x .
                    941: .PP
                    942: Array elements may be named by non-numeric values,
                    943: which gives
                    944: .IT awk
                    945: a capability rather like the associative memory of
                    946: Snobol tables.
                    947: Suppose the input contains fields with values like
                    948: .UL apple ,
                    949: .UL orange ,
                    950: etc.
                    951: Then the program
                    952: .P1
                    953: /apple/        { x["apple"]++ }
                    954: /orange/       { x["orange"]++ }
                    955: END            { print x["apple"], x["orange"] }
                    956: .P2
                    957: increments counts for the named array elements,
                    958: and prints them at the end of the input.
                    959: .NH 2
                    960: Flow-of-Control Statements
                    961: .PP
                    962: .IT Awk
                    963: provides the basic flow-of-control statements
                    964: .UL if-else ,
                    965: .UL while ,
                    966: .UL for ,
                    967: and statement grouping with braces, as in C.
                    968: We showed the
                    969: .UL if
                    970: statement in section 3.3 without describing it.
                    971: The condition in parentheses is evaluated;
                    972: if it is true, the statement following the
                    973: .UL if
                    974: is done.
                    975: The
                    976: .UL else
                    977: part is optional.
                    978: .PP
                    979: The
                    980: .UL while
                    981: statement is exactly like that of C.
                    982: For example, to print all input fields one per line,
                    983: .P1
                    984: i = 1
                    985: while (i <= NF) {
                    986:        print $i
                    987:        ++i
                    988: }
                    989: .P2
                    990: .PP
                    991: The
                    992: .UL for
                    993: statement is also exactly like that of C:
                    994: .P1
                    995: for (i = 1; i <= NF; i++)
                    996:        print $i
                    997: .P2
                    998: does the same job as the
                    999: .UL while
                   1000: statement above.
                   1001: .PP
                   1002: There is an alternate form of the
                   1003: .UL for
                   1004: statement which is suited for accessing the
                   1005: elements of an associative array:
                   1006: .P1
                   1007: for (i in array)
                   1008:        \fIstatement\f3
                   1009: .P2
                   1010: does
                   1011: .ul
                   1012: statement
                   1013: with 
                   1014: .UL i
                   1015: set in turn to the subscript of each element of
                   1016: .UL array .
                   1017: The elements are accessed in an apparently random order.
                   1018: Chaos will ensue if 
                   1019: .UL i
                   1020: is altered, or if any new elements are
                   1021: accessed during the loop.
                   1022: .PP
                   1023: The expression in the condition part of an
                   1024: .UL if ,
                   1025: .UL while
                   1026: or
                   1027: .UL for
                   1028: can include relational operators like
                   1029: .UL < ,
                   1030: .UL <= ,
                   1031: .UL > ,
                   1032: .UL >= ,
                   1033: .UL ==
                   1034: (``is equal to''),
                   1035: and
                   1036: .UL !=
                   1037: (``not equal to'');
                   1038: regular expression matches with the match operators
                   1039: .UL ~
                   1040: and
                   1041: .UL !~ ;
                   1042: the logical operators
                   1043: .UL \||\|| ,
                   1044: .UL && ,
                   1045: and
                   1046: .UL ! ;
                   1047: and of course parentheses for grouping.
                   1048: .PP
                   1049: The
                   1050: .UL break
                   1051: statement causes an immediate exit
                   1052: from an enclosing
                   1053: .UL while
                   1054: or
                   1055: .UL for ;
                   1056: the
                   1057: .UL continue
                   1058: statement
                   1059: causes the next iteration to begin.
                   1060: .PP
                   1061: The statement
                   1062: .UL next
                   1063: causes
                   1064: .IT awk
                   1065: to skip immediately to
                   1066: the next record and begin scanning the patterns from the top.
                   1067: The statement
                   1068: .UL exit
                   1069: causes the program to behave as if the end of the input
                   1070: had occurred.
                   1071: .PP
                   1072: Comments may be placed in
                   1073: .IT awk
                   1074: programs:
                   1075: they begin with the character
                   1076: .UL #
                   1077: and end with the end of the line,
                   1078: as in
                   1079: .P1
                   1080: print x, y     # this is a comment
                   1081: .P2
                   1082: .NH
                   1083: Design
                   1084: .PP
                   1085: The
                   1086: .UX
                   1087: system
                   1088: already provides several programs that
                   1089: operate by passing input through a
                   1090: selection mechanism.
                   1091: .IT Grep ,
                   1092: the first and simplest, merely prints all lines which
                   1093: match a single specified pattern.
                   1094: .IT Egrep
                   1095: provides more general patterns, i.e., regular expressions
                   1096: in full generality;
                   1097: .IT fgrep
                   1098: searches for a set of keywords with a particularly fast algorithm.
                   1099: .IT Sed\|
                   1100: .[
                   1101: unix programm manual
                   1102: .]
                   1103: provides most of the editing facilities of
                   1104: the editor
                   1105: .IT ed  ,
                   1106: applied to a stream of input.
                   1107: None of these programs provides
                   1108: numeric capabilities,
                   1109: logical relations,
                   1110: or variables.
                   1111: .PP
                   1112: .IT Lex\|
                   1113: .[
                   1114: lesk lexical analyzer cstr
                   1115: .]
                   1116: provides general regular expression recognition capabilities,
                   1117: and, by serving as a C program generator,
                   1118: is essentially open-ended in its capabilities.
                   1119: The use of
                   1120: .IT lex ,
                   1121: however, requires a knowledge of C programming,
                   1122: and a
                   1123: .IT lex
                   1124: program must be compiled and loaded before use,
                   1125: which discourages its use for one-shot applications.
                   1126: .PP
                   1127: .IT Awk
                   1128: is an attempt
                   1129: to fill in another part of the matrix of possibilities.
                   1130: It
                   1131: provides general regular expression capabilities
                   1132: and an implicit input/output loop.
                   1133: But it also provides convenient numeric processing,
                   1134: variables,
                   1135: more general selection,
                   1136: and control flow in the actions.
                   1137: It
                   1138: does not require compilation or a knowledge of C.
                   1139: Finally,
                   1140: .IT awk
                   1141: provides
                   1142: a convenient way to access fields within lines;
                   1143: it is unique in this respect.
                   1144: .PP
                   1145: .IT Awk
                   1146: also tries to integrate strings and numbers
                   1147: completely,
                   1148: by treating all quantities as both string and numeric,
                   1149: deciding which representation is appropriate
                   1150: as late as possible.
                   1151: In most cases the user can simply ignore the differences.
                   1152: .PP
                   1153: Most of the effort in developing
                   1154: .I awk
                   1155: went into deciding what
                   1156: .I awk
                   1157: should or should not do
                   1158: (for instance, it doesn't do string substitution)
                   1159: and what the syntax should be
                   1160: (no explicit operator for concatenation)
                   1161: rather
                   1162: than into writing or debugging the code.
                   1163: We have tried
                   1164: to make the syntax powerful
                   1165: but easy to use and well adapted
                   1166: to scanning files.
                   1167: For example,
                   1168: the combination of implicit initialization and the absence of declarations,
                   1169: while probably a bad idea for a general-purpose programming language,
                   1170: is desirable in a language
                   1171: that is meant to be used for tiny programs
                   1172: that may even be composed on the command line.
                   1173: .PP
                   1174: In practice,
                   1175: .IT awk
                   1176: usage seems to fall into two broad categories.
                   1177: One is what might be called ``report generation'' \(em
                   1178: processing an input to extract counts,
                   1179: sums, sub-totals, etc.
                   1180: This also includes the writing of trivial
                   1181: data validation programs,
                   1182: such as verifying that a field contains only numeric information
                   1183: or that certain delimiters are properly balanced.
                   1184: The combination of textual and numeric processing is invaluable here.
                   1185: .PP
                   1186: A second area of use is as a data transformer,
                   1187: converting data from the form produced by one program
                   1188: into that expected by another.
                   1189: The simplest examples merely select fields, perhaps with rearrangements.
                   1190: .NH
                   1191: Implementation
                   1192: .PP
                   1193: The actual implementation of
                   1194: .IT awk
                   1195: uses the language development tools available
                   1196: on the
                   1197: .UC UNIX
                   1198: operating system.
                   1199: The grammar is specified with
                   1200: .IT yacc ;
                   1201: .[
                   1202: yacc johnson cstr
                   1203: .]
                   1204: the lexical analysis is done by
                   1205: .IT lex ;
                   1206: the regular expression recognizers are
                   1207: deterministic finite automata
                   1208: constructed directly from the expressions.
                   1209: An
                   1210: .IT awk
                   1211: program is translated into a 
                   1212: parse tree which is then directly executed
                   1213: by a simple interpreter.
                   1214: .PP
                   1215: .IT Awk
                   1216: was designed for ease of use rather than processing speed;
                   1217: the delayed evaluation of variable types
                   1218: and the necessity to break input
                   1219: into fields make high speed difficult to achieve in any case.
                   1220: Nonetheless,
                   1221: the program has not proven to be unworkably slow.
                   1222: .PP
                   1223: Table I below shows the execution (user + system) time
                   1224: on a PDP-11/70 of
                   1225: the
                   1226: .UC UNIX
                   1227: programs
                   1228: .IT wc ,
                   1229: .IT grep ,
                   1230: .IT egrep ,
                   1231: .IT fgrep ,
                   1232: .IT sed ,
                   1233: .IT lex ,
                   1234: and
                   1235: .IT awk
                   1236: on the following simple tasks:
                   1237: .IP "\ \ 1."
                   1238: count the number of lines.
                   1239: .IP "\ \ 2."
                   1240: print all lines containing ``doug''.
                   1241: .IP "\ \ 3."
                   1242: print all lines containing ``doug'', ``ken'' or ``dmr''.
                   1243: .IP "\ \ 4."
                   1244: print the third field of each line.
                   1245: .IP "\ \ 5."
                   1246: print the third and second fields of each line, in that order.
                   1247: .IP "\ \ 6."
                   1248: append all lines containing ``doug'', ``ken'', and ``dmr''
                   1249: to files ``jdoug'', ``jken'', and ``jdmr'', respectively.
                   1250: .IP "\ \ 7."
                   1251: print each line prefixed by ``line-number\ :\ ''.
                   1252: .IP "\ \ 8."
                   1253: sum the fourth column of a table.
                   1254: .LP
                   1255: The program
                   1256: .IT wc
                   1257: merely counts words, lines and characters in its input;
                   1258: we have already mentioned the others.
                   1259: In all cases the input was a file containing
                   1260: 10,000 lines
                   1261: as created by the
                   1262: command
                   1263: .IT "ls \-l" ;
                   1264: each line has the form
                   1265: .P1
                   1266: -rw-rw-rw- 1 ava 123 Oct 15 17:05 xxx
                   1267: .P2
                   1268: The total length of this input is
                   1269: 452,960 characters.
                   1270: Times for
                   1271: .IT lex
                   1272: do not include compile or load.
                   1273: .PP
                   1274: As might be expected,
                   1275: .IT awk
                   1276: is not as fast as the specialized tools
                   1277: .IT wc ,
                   1278: .IT sed ,
                   1279: or the programs in the
                   1280: .IT grep
                   1281: family,
                   1282: but
                   1283: is faster than the more general tool
                   1284: .IT lex .
                   1285: In all cases, the tasks were
                   1286: about as easy to express as
                   1287: .IT awk
                   1288: programs
                   1289: as programs in these other languages;
                   1290: tasks involving fields were
                   1291: considerably easier to express as
                   1292: .IT awk
                   1293: programs.
                   1294: Some of the test programs are shown in
                   1295: .IT awk ,
                   1296: .IT sed
                   1297: and
                   1298: .IT lex .
                   1299: .[
                   1300: $LIST$
                   1301: .]
                   1302: .1C
                   1303: .TS
                   1304: center;
                   1305: c c c c c c c c c
                   1306: c c c c c c c c c
                   1307: c|n|n|n|n|n|n|n|n|.
                   1308:                                Task
                   1309: Program        1       2       3       4       5       6       7       8
                   1310: _
                   1311: \fIwc\fR       8.6
                   1312: \fIgrep\fR     11.7    13.1
                   1313: \fIegrep\fR    6.2     11.5    11.6
                   1314: \fIfgrep\fR    7.7     13.8    16.1
                   1315: \fIsed\fR      10.2    11.6    15.8    29.0    30.5    16.1
                   1316: \fIlex\fR      65.1    150.1   144.2   67.7    70.3    104.0   81.7    92.8
                   1317: \fIawk\fR      15.0    25.6    29.9    33.3    38.9    46.4    71.4    31.1
                   1318: _
                   1319: .TE
                   1320: .sp
                   1321: .ce
                   1322: \fBTable I.\fR  Execution Times of Programs. (Times are in sec.)
                   1323: .sp 2
                   1324: .2C
                   1325: .PP
                   1326: The programs for some of these jobs are shown below.
                   1327: The
                   1328: .IT lex
                   1329: programs are generally too long to show.
                   1330: .LP
                   1331: AWK:
                   1332: .LP
                   1333: .P1
                   1334: 1.     END     {print NR}
                   1335: .P2
                   1336: .P1
                   1337: 2.     /doug/
                   1338: .P2
                   1339: .P1
                   1340: 3.     /ken|doug|dmr/
                   1341: .P2
                   1342: .P1
                   1343: 4.     {print $3}
                   1344: .P2
                   1345: .P1
                   1346: 5.     {print $3, $2}
                   1347: .P2
                   1348: .P1
                   1349: 6.     /ken/   {print >"jken"}
                   1350:        /doug/  {print >"jdoug"}
                   1351:        /dmr/   {print >"jdmr"}
                   1352: .P2
                   1353: .P1
                   1354: 7.     {print NR ": " $0}
                   1355: .P2
                   1356: .P1
                   1357: 8.             {sum = sum + $4}
                   1358:        END     {print sum}
                   1359: .P2
                   1360: .LP
                   1361: SED:
                   1362: .LP
                   1363: .P1
                   1364: 1.     $=
                   1365: .P2
                   1366: .P1
                   1367: 2.     /doug/p
                   1368: .P2
                   1369: .P1
                   1370: 3.     /doug/p
                   1371:        /doug/d
                   1372:        /ken/p
                   1373:        /ken/d
                   1374:        /dmr/p
                   1375:        /dmr/d
                   1376: .P2
                   1377: .P1
                   1378: 4.     /[^ ]* [ ]*[^ ]* [ ]*\e([^ ]*\e) .*/s//\e1/p
                   1379: .P2
                   1380: .P1
                   1381: 5.     /[^ ]* [ ]*\e([^ ]*\e) [ ]*\e([^ ]*\e) .*/s//\e2 \e1/p
                   1382: .P2
                   1383: .P1
                   1384: 6.     /ken/w jken
                   1385:        /doug/w jdoug
                   1386:        /dmr/w jdmr
                   1387: .P2
                   1388: .LP
                   1389: LEX:
                   1390: .LP
                   1391: .P1
                   1392: 1.     %{
                   1393:        int i;
                   1394:        %}
                   1395:        %%
                   1396:        \en     i++;
                   1397:        .       ;
                   1398:        %%
                   1399:        yywrap() {
                   1400:                printf("%d\en", i);
                   1401:        }
                   1402: .P2
                   1403: .P1
                   1404: 2.     %%
                   1405:        ^.*doug.*$      printf("%s\en", yytext);
                   1406:        .       ;
                   1407:        \en     ;
                   1408: .P2

unix.superglobalmegacorp.com

This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.