Annotation of 43BSDReno/share/doc/usd/19.awk/awk, revision 1.1

1.1     ! root        1: .\"    @(#)awk 6.1 (Berkeley) 5/22/86
        !             2: .\"
        !             3: .EH 'USD:19-%''Awk \(em A Pattern Scanning and Processing Language'
        !             4: .OH 'Awk \(em A Pattern Scanning and Processing Language''USD:19-%'
        !             5: .\" .fp 3 G  no G on APS (use gb) or Dandelion Printer (use CW)
        !             6: .\" the .T is only a ditroff feature...
        !             7: .if '\*.T'dp' .fp 3 El
        !             8: .if '\*.T'aps' .fp 3 gB
        !             9: ....TM "78-1271-12, 78-1273-6" 39199 39199-11
        !            10: .ND "September 1, 1978"
        !            11: ....TR 68
        !            12: .\".RP
        !            13: .      \" macros here
        !            14: .tr _\(em
        !            15: .if t .tr ~\(ap
        !            16: .tr |\(or
        !            17: .tr *\(**
        !            18: .de UC
        !            19: \&\\$3\s-1\\$1\\s0\&\\$2
        !            20: ..
        !            21: .de IT
        !            22: .if n .ul
        !            23: \&\\$3\f2\\$1\fP\|\\$2
        !            24: ..
        !            25: .de UL
        !            26: .if n .ul
        !            27: \&\\$3\f3\\$1\fP\&\\$2
        !            28: ..
        !            29: .de P1
        !            30: .DS I 3n
        !            31: .nf
        !            32: .if n .ta 5 10 15 20 25 30 35 40 45 50 55 60
        !            33: .if t .ta .3i .6i .9i 1.2i
        !            34: .if t .tr -\-'\(fm*\(**
        !            35: .if t .tr _\(ul
        !            36: .ft 3
        !            37: .lg 0
        !            38: .ss 18
        !            39: .              \"use first argument as indent if present
        !            40: ..
        !            41: .de P2
        !            42: .ps \\n(PS
        !            43: .vs \\n(VSp
        !            44: .ft R
        !            45: .ss 12
        !            46: .if n .ls 2
        !            47: .tr --''``^^!!
        !            48: .if t .tr _\(em
        !            49: .fi
        !            50: .lg
        !            51: .DE
        !            52: ..
        !            53: .hw semi-colon
        !            54: .hy 14
        !            55: .              \"2=not last lines; 4= no -xx; 8=no xx-
        !            56: .              \"special chars in programs
        !            57: .de WS
        !            58: .sp \\$1
        !            59: ..
        !            60: .      \" end of macros
        !            61: .TL
        !            62: Awk \(em A Pattern Scanning and Processing Language
        !            63: .br
        !            64: (Second Edition)
        !            65: .AU "MH 2C-522" 4862
        !            66: Alfred V. Aho
        !            67: .AU "MH 2C-518" 6021
        !            68: Brian W. Kernighan
        !            69: .AU "MH 2C-514" 7214
        !            70: Peter J. Weinberger
        !            71: .AI
        !            72: .MH
        !            73: .AB
        !            74: .IT Awk
        !            75: is a programming language whose
        !            76: basic operation
        !            77: is to search a set of files
        !            78: for patterns, and to perform specified actions upon lines or fields of lines which
        !            79: contain instances of those patterns.
        !            80: .IT Awk
        !            81: makes certain data selection and transformation operations easy to express;
        !            82: for example, the
        !            83: .IT awk
        !            84: program
        !            85: .sp
        !            86: .ce
        !            87: .ft 3
        !            88: length > 72
        !            89: .ft
        !            90: .sp
        !            91: prints all input lines whose length exceeds 72 characters;
        !            92: the program
        !            93: .ce
        !            94: .sp
        !            95: .ft 3
        !            96: NF % 2 == 0
        !            97: .ft R
        !            98: .sp
        !            99: prints all lines with an even number of fields;
        !           100: and the program
        !           101: .ce
        !           102: .sp
        !           103: .ft 3
        !           104: { $1 = log($1); print }
        !           105: .ft R
        !           106: .sp
        !           107: replaces the first field of each line by its logarithm.
        !           108: .PP
        !           109: .IT Awk
        !           110: patterns may include arbitrary boolean combinations of regular expressions
        !           111: and of relational operators on strings, numbers, fields, variables, and array elements.
        !           112: Actions may include the same pattern-matching constructions as in patterns,
        !           113: as well as
        !           114: arithmetic and string expressions and assignments,
        !           115: .UL if-else ,
        !           116: .UL while ,
        !           117: .UL for
        !           118: statements,
        !           119: and multiple output streams.
        !           120: .PP
        !           121: This report contains a user's guide, a discussion of the design and implementation of
        !           122: .IT awk ,
        !           123: and some timing statistics.
        !           124: ....It supersedes TM-77-1271-5, dated September 8, 1977.
        !           125: .AE
        !           126: .CS 6 1 7 0 1 4
        !           127: .if n .ls 2
        !           128: .nr PS 9
        !           129: .nr VS 11
        !           130: .NH
        !           131: Introduction
        !           132: .if t .2C
        !           133: .PP
        !           134: .IT Awk
        !           135: is a programming language designed to make
        !           136: many common
        !           137: information retrieval and text manipulation tasks
        !           138: easy to state and to perform.
        !           139: .PP
        !           140: The basic operation of
        !           141: .IT awk
        !           142: is to scan a set of input lines in order,
        !           143: searching for lines which match any of a set of patterns
        !           144: which the user has specified.
        !           145: For each pattern, an action can be specified;
        !           146: this action will be performed on each line that matches the pattern.
        !           147: .PP
        !           148: Readers familiar with the
        !           149: .UX
        !           150: program
        !           151: .IT grep\|
        !           152: .[
        !           153: unix program manual
        !           154: .]
        !           155: will recognize
        !           156: the approach, although in
        !           157: .IT awk
        !           158: the patterns may be more
        !           159: general than in
        !           160: .IT grep ,
        !           161: and the actions allowed are more involved than merely
        !           162: printing the matching line.
        !           163: For example, the
        !           164: .IT awk
        !           165: program
        !           166: .P1
        !           167: {print $3, $2}
        !           168: .P2
        !           169: prints the third and second columns of a table
        !           170: in that order.
        !           171: The program
        !           172: .P1
        !           173: $2 ~ /A\||B\||C/
        !           174: .P2
        !           175: prints all input lines with an A, B, or C in the second field.
        !           176: The program
        !           177: .P1
        !           178: $1 != prev     { print; prev = $1 }
        !           179: .P2
        !           180: prints all lines in which the first field is different
        !           181: from the previous first field.
        !           182: .NH 2
        !           183: Usage
        !           184: .PP
        !           185: The command
        !           186: .P1
        !           187: awk  program  [files]
        !           188: .P2
        !           189: executes the
        !           190: .IT awk
        !           191: commands in
        !           192: the string
        !           193: .UL program
        !           194: on the set of named files,
        !           195: or on the standard input if there are no files.
        !           196: The statements can also be placed in a file
        !           197: .UL pfile ,
        !           198: and executed by the command
        !           199: .P1
        !           200: awk  -f pfile  [files]
        !           201: .P2
        !           202: .NH 2
        !           203: Program Structure
        !           204: .PP
        !           205: An
        !           206: .IT awk
        !           207: program is a sequence of statements of the form:
        !           208: .P1
        !           209: .ft I
        !           210:        pattern { action }
        !           211:        pattern { action }
        !           212:        ...
        !           213: .ft 3
        !           214: .P2
        !           215: Each line of input
        !           216: is matched against
        !           217: each of the patterns in turn.
        !           218: For each pattern that matches, the associated action
        !           219: is executed.
        !           220: When all the patterns have been tested, the next line
        !           221: is fetched and the matching starts over.
        !           222: .PP
        !           223: Either the pattern or the action may be left out,
        !           224: but not both.
        !           225: If there is no action for a pattern,
        !           226: the matching line is simply
        !           227: copied to the output.
        !           228: (Thus a line which matches several patterns can be printed several times.)
        !           229: If there is no pattern for an action,
        !           230: then the action is performed for every input line.
        !           231: A line which matches no pattern is ignored.
        !           232: .PP
        !           233: Since patterns and actions are both optional,
        !           234: actions must be enclosed in braces
        !           235: to distinguish them from patterns.
        !           236: .NH 2
        !           237: Records and Fields
        !           238: .PP
        !           239: .IT Awk
        !           240: input is divided into
        !           241: ``records'' terminated by a record separator.
        !           242: The default record separator is a newline,
        !           243: so by default
        !           244: .IT awk
        !           245: processes its input a line at a time.
        !           246: The number of the current record is available in a variable
        !           247: named
        !           248: .UL NR .
        !           249: .PP
        !           250: Each input record
        !           251: is considered to be divided into ``fields.''
        !           252: Fields are normally separated by
        !           253: white space \(em blanks or tabs \(em
        !           254: but the input field separator may be changed, as described below.
        !           255: Fields are referred to as
        !           256: .UL "$1, $2,"
        !           257: and so forth,
        !           258: where
        !           259: .UL $1
        !           260: is the first field,
        !           261: and
        !           262: .UL $0
        !           263: is the whole input record itself.
        !           264: Fields may be assigned to.
        !           265: The number of fields in the current record
        !           266: is available in a variable named
        !           267: .UL NF .
        !           268: .PP
        !           269: The variables
        !           270: .UL FS
        !           271: and
        !           272: .UL RS
        !           273: refer to the input field and record separators;
        !           274: they may be changed at any time to any single character.
        !           275: The optional command-line argument
        !           276: \f3\-F\fIc\fR
        !           277: may also be used to set
        !           278: .UL FS
        !           279: to the character
        !           280: .IT c .
        !           281: .PP
        !           282: If the record separator is empty,
        !           283: an empty input line is taken as the record separator,
        !           284: and blanks, tabs and newlines are treated as field separators.
        !           285: .PP
        !           286: The variable
        !           287: .UL FILENAME
        !           288: contains the name of the current input file.
        !           289: .NH 2
        !           290: Printing
        !           291: .PP
        !           292: An action may have no pattern,
        !           293: in which case the action is executed for
        !           294: all
        !           295: lines.
        !           296: The simplest action is to print some or all of a record;
        !           297: this is accomplished by the
        !           298: .IT awk
        !           299: command
        !           300: .UL print .
        !           301: The
        !           302: .IT awk
        !           303: program
        !           304: .P1
        !           305: { print }
        !           306: .P2
        !           307: prints each record, thus copying the input to the output intact.
        !           308: More useful is to print a field or fields from each record.
        !           309: For instance, 
        !           310: .P1
        !           311: print $2, $1
        !           312: .P2
        !           313: prints the first two fields in reverse order.
        !           314: Items separated by a comma in the print statement will be separated by the current output field separator
        !           315: when output.
        !           316: Items not separated by commas will be concatenated,
        !           317: so
        !           318: .P1
        !           319: print $1 $2
        !           320: .P2
        !           321: runs the first and second fields together.
        !           322: .PP
        !           323: The predefined variables
        !           324: .UL NF
        !           325: and
        !           326: .UL NR
        !           327: can be used;
        !           328: for example
        !           329: .P1
        !           330: { print NR, NF, $0 }
        !           331: .P2
        !           332: prints each record preceded by the record number and the number of fields.
        !           333: .PP
        !           334: Output may be diverted to multiple files;
        !           335: the program
        !           336: .P1
        !           337: { print $1 >"foo1"; print $2 >"foo2" }
        !           338: .P2
        !           339: writes the first field,
        !           340: .UL $1 ,
        !           341: on the file
        !           342: .UL foo1 ,
        !           343: and the second field on file
        !           344: .UL foo2 .
        !           345: The
        !           346: .UL >>
        !           347: notation can also be used:
        !           348: .P1
        !           349: print $1 >>"foo"
        !           350: .P2
        !           351: appends the output to the file
        !           352: .UL foo .
        !           353: (In each case,
        !           354: the output files are
        !           355: created if necessary.)
        !           356: The file name can be a variable or a field as well as a constant;
        !           357: for example,
        !           358: .P1
        !           359: print $1 >$2
        !           360: .P2
        !           361: uses the contents of field 2 as a file name.
        !           362: .PP
        !           363: Naturally there is a limit on the number of output files;
        !           364: currently it is 10.
        !           365: .PP
        !           366: Similarly, output can be piped into another process
        !           367: (on
        !           368: .UC UNIX
        !           369: only); for instance,
        !           370: .P1
        !           371: print | "mail bwk"
        !           372: .P2
        !           373: mails the output to
        !           374: .UL bwk .
        !           375: .PP
        !           376: The variables
        !           377: .UL OFS
        !           378: and
        !           379: .UL ORS
        !           380: may be used to change the current
        !           381: output field separator and output
        !           382: record separator.
        !           383: The output record separator is
        !           384: appended to the output of the
        !           385: .UL print
        !           386: statement.
        !           387: .PP
        !           388: .IT Awk
        !           389: also provides the
        !           390: .UL printf
        !           391: statement for output formatting:
        !           392: .P1
        !           393: printf format, expr, expr, ...
        !           394: .P2
        !           395: formats the expressions in the list
        !           396: according to the specification
        !           397: in
        !           398: .UL format
        !           399: and prints them.
        !           400: For example,
        !           401: .P1
        !           402: printf "%8.2f  %10ld\en", $1, $2
        !           403: .P2
        !           404: prints 
        !           405: .UL $1
        !           406: as a floating point number 8 digits wide,
        !           407: with two after the decimal point,
        !           408: and
        !           409: .UL $2
        !           410: as a 10-digit long decimal number,
        !           411: followed by a newline.
        !           412: No output separators are produced automatically;
        !           413: you must add them yourself,
        !           414: as in this example.
        !           415: The version of
        !           416: .UL printf
        !           417: is identical to that used with C.
        !           418: .[
        !           419: C programm language prentice hall 1978
        !           420: .]
        !           421: .NH 1
        !           422: Patterns
        !           423: .PP
        !           424: A pattern in front of an action acts as a selector
        !           425: that determines whether the action is to be executed.
        !           426: A variety of expressions may be used as patterns:
        !           427: regular expressions,
        !           428: arithmetic relational expressions,
        !           429: string-valued expressions,
        !           430: and arbitrary boolean
        !           431: combinations of these.
        !           432: .NH 2
        !           433: BEGIN and END
        !           434: .PP
        !           435: The special pattern
        !           436: .UL BEGIN
        !           437: matches the beginning of the input,
        !           438: before the first record is read.
        !           439: The pattern
        !           440: .UL END
        !           441: matches the end of the input,
        !           442: after the last record has been processed.
        !           443: .UL BEGIN
        !           444: and
        !           445: .UL END
        !           446: thus provide a way to gain control before and after processing,
        !           447: for initialization and wrapup.
        !           448: .PP
        !           449: As an example, the field separator
        !           450: can be set to a colon by
        !           451: .P1
        !           452: BEGIN  { FS = ":" }
        !           453: .ft I
        !           454: \&... rest of program ...
        !           455: .ft 3
        !           456: .P2
        !           457: Or the input lines may be counted by
        !           458: .P1
        !           459: END  { print NR }
        !           460: .P2
        !           461: If
        !           462: .UL BEGIN
        !           463: is present, it must be the first pattern;
        !           464: .UL END
        !           465: must be the last if used.
        !           466: .NH 2
        !           467: Regular Expressions
        !           468: .PP
        !           469: The simplest regular expression is a literal string of characters
        !           470: enclosed in slashes,
        !           471: like
        !           472: .P1
        !           473: /smith/
        !           474: .P2
        !           475: This
        !           476: is actually a complete
        !           477: .IT awk
        !           478: program which
        !           479: will print all lines which contain any occurrence
        !           480: of the name ``smith''.
        !           481: If a line contains ``smith''
        !           482: as part of a larger word,
        !           483: it will also be printed, as in
        !           484: .P1
        !           485: blacksmithing
        !           486: .P2
        !           487: .PP
        !           488: .IT Awk
        !           489: regular expressions include the regular expression
        !           490: forms found in
        !           491: the
        !           492: .UC UNIX
        !           493: text editor
        !           494: .IT ed\|
        !           495: .[
        !           496: unix program manual
        !           497: .]
        !           498: and
        !           499: .IT grep
        !           500: (without back-referencing).
        !           501: In addition,
        !           502: .IT awk
        !           503: allows
        !           504: parentheses for grouping, | for alternatives,
        !           505: .UL +
        !           506: for ``one or more'', and
        !           507: .UL ?
        !           508: for ``zero or one'',
        !           509: all as in
        !           510: .IT lex .
        !           511: Character classes
        !           512: may be abbreviated:
        !           513: .UL [a\-zA\-Z0\-9]
        !           514: is the set of all letters and digits.
        !           515: As an example,
        !           516: the
        !           517: .IT awk
        !           518: program
        !           519: .P1
        !           520: /[Aa]ho\||[Ww]einberger\||[Kk]ernighan/
        !           521: .P2
        !           522: will print all lines which contain any of the names
        !           523: ``Aho,'' ``Weinberger'' or ``Kernighan,''
        !           524: whether capitalized or not.
        !           525: .PP
        !           526: Regular expressions
        !           527: (with the extensions listed above)
        !           528: must be enclosed in slashes,
        !           529: just as in
        !           530: .IT ed
        !           531: and
        !           532: .IT sed .
        !           533: Within a regular expression,
        !           534: blanks and the regular expression
        !           535: metacharacters are significant.
        !           536: To turn off the magic meaning
        !           537: of one of the regular expression characters,
        !           538: precede it with a backslash.
        !           539: An example is the pattern
        !           540: .P1
        !           541: /\|\e/\^.\^*\e//
        !           542: .P2
        !           543: which matches any string of characters
        !           544: enclosed in slashes.
        !           545: .PP
        !           546: One can also specify that any field or variable
        !           547: matches
        !           548: a regular expression (or does not match it) with the operators
        !           549: .UL ~
        !           550: and
        !           551: .UL !~ .
        !           552: The program
        !           553: .P1
        !           554: $1 ~ /[jJ]ohn/
        !           555: .P2
        !           556: prints all lines where the first field matches ``john'' or ``John.''
        !           557: Notice that this will also match ``Johnson'', ``St. Johnsbury'', and so on.
        !           558: To restrict it to exactly
        !           559: .UL [jJ]ohn ,
        !           560: use
        !           561: .P1
        !           562: $1 ~ /^[jJ]ohn$/
        !           563: .P2
        !           564: The caret ^ refers to the beginning
        !           565: of a line or field;
        !           566: the dollar sign
        !           567: .UL $
        !           568: refers to the end.
        !           569: .NH 2
        !           570: Relational Expressions
        !           571: .PP
        !           572: An
        !           573: .IT awk
        !           574: pattern can be a relational expression
        !           575: involving the usual relational operators
        !           576: .UL < ,
        !           577: .UL <= ,
        !           578: .UL == ,
        !           579: .UL != ,
        !           580: .UL >= ,
        !           581: and
        !           582: .UL > .
        !           583: An example is
        !           584: .P1
        !           585: $2 > $1 + 100
        !           586: .P2
        !           587: which selects lines where the second field
        !           588: is more than 100 greater than the first field.
        !           589: Similarly,
        !           590: .P1
        !           591: NF % 2 == 0
        !           592: .P2
        !           593: prints lines with an even number of fields.
        !           594: .PP
        !           595: In relational tests, if neither operand is numeric,
        !           596: a string comparison is made;
        !           597: otherwise it is numeric.
        !           598: Thus,
        !           599: .P1
        !           600: $1 >= "s"
        !           601: .P2
        !           602: selects lines that begin with an
        !           603: .UL s ,
        !           604: .UL t ,
        !           605: .UL u ,
        !           606: etc.
        !           607: In the absence of any other information,
        !           608: fields are treated as strings, so
        !           609: the program
        !           610: .P1
        !           611: $1 > $2
        !           612: .P2
        !           613: will perform a string comparison.
        !           614: .NH 2
        !           615: Combinations of Patterns
        !           616: .PP
        !           617: A pattern can be any boolean combination of patterns,
        !           618: using the operators
        !           619: .UL \||\||
        !           620: (or),
        !           621: .UL &&
        !           622: (and), and
        !           623: .UL !
        !           624: (not).
        !           625: For example,
        !           626: .P1
        !           627: $1 >= "s" && $1 < "t" && $1 != "smith"
        !           628: .P2
        !           629: selects lines where the first field begins with ``s'', but is not ``smith''.
        !           630: .UL &&
        !           631: and
        !           632: .UL \||\||
        !           633: guarantee that their operands
        !           634: will be evaluated
        !           635: from left to right;
        !           636: evaluation stops as soon as the truth or falsehood
        !           637: is determined.
        !           638: .NH 2
        !           639: Pattern Ranges
        !           640: .PP
        !           641: The ``pattern'' that selects an action may also
        !           642: consist of two patterns separated by a comma, as in
        !           643: .P1
        !           644: pat1, pat2     { ... }
        !           645: .P2
        !           646: In this case, the action is performed for each line between
        !           647: an occurrence of
        !           648: .UL pat1
        !           649: and the next occurrence of
        !           650: .UL pat2
        !           651: (inclusive).
        !           652: For example,
        !           653: .P1
        !           654: /start/, /stop/
        !           655: .P2
        !           656: prints all lines between
        !           657: .UL start
        !           658: and
        !           659: .UL stop ,
        !           660: while
        !           661: .P1
        !           662: NR == 100, NR == 200 { ... }
        !           663: .P2
        !           664: does the action for lines 100 through 200
        !           665: of the input.
        !           666: .NH 1
        !           667: Actions
        !           668: .PP
        !           669: An
        !           670: .IT awk
        !           671: action is a sequence of action statements
        !           672: terminated by newlines or semicolons.
        !           673: These action statements can be used to do a variety of
        !           674: bookkeeping and string manipulating tasks.
        !           675: .NH 2
        !           676: Built-in Functions
        !           677: .PP
        !           678: .IT Awk
        !           679: provides a ``length'' function
        !           680: to compute the length of a string of characters.
        !           681: This program prints each record,
        !           682: preceded by its length:
        !           683: .P1
        !           684: {print length, $0}
        !           685: .P2
        !           686: .UL length
        !           687: by itself is a ``pseudo-variable'' which
        !           688: yields the length of the current record;
        !           689: .UL length(argument)
        !           690: is a function which yields the length of its argument,
        !           691: as in
        !           692: the equivalent
        !           693: .P1
        !           694: {print length($0), $0}
        !           695: .P2
        !           696: The argument may be any expression.
        !           697: .PP
        !           698: .IT Awk
        !           699: also
        !           700: provides the arithmetic functions
        !           701: .UL sqrt ,
        !           702: .UL log ,
        !           703: .UL exp ,
        !           704: and
        !           705: .UL int ,
        !           706: for
        !           707: square root,
        !           708: base
        !           709: .IT e
        !           710: logarithm,
        !           711: exponential,
        !           712: and integer part of their respective arguments.
        !           713: .PP
        !           714: The name of one of these built-in functions,
        !           715: without argument or parentheses,
        !           716: stands for the value of the function on the
        !           717: whole record.
        !           718: The program
        !           719: .P1
        !           720: length < 10 || length > 20
        !           721: .P2
        !           722: prints lines whose length
        !           723: is less than 10 or greater
        !           724: than 20.
        !           725: .PP
        !           726: The function
        !           727: .UL substr(s,\ m,\ n)
        !           728: produces the substring of
        !           729: .UL s
        !           730: that begins at position
        !           731: .UL m
        !           732: (origin 1)
        !           733: and is at most
        !           734: .UL n
        !           735: characters long.
        !           736: If
        !           737: .UL n
        !           738: is omitted, the substring goes to the end of
        !           739: .UL s .
        !           740: The function
        !           741: .UL index(s1,\ s2)
        !           742: returns the position where the string
        !           743: .UL s2
        !           744: occurs in
        !           745: .UL s1 ,
        !           746: or zero if it does not.
        !           747: .PP
        !           748: The function
        !           749: .UL sprintf(f,\ e1,\ e2,\ ...)
        !           750: produces the value of the expressions
        !           751: .UL e1 ,
        !           752: .UL e2 ,
        !           753: etc.,
        !           754: in the
        !           755: .UL printf
        !           756: format specified by
        !           757: .UL f .
        !           758: Thus, for example,
        !           759: .P1
        !           760: x = sprintf("%8.2f %10ld", $1, $2)
        !           761: .P2
        !           762: sets
        !           763: .UL x
        !           764: to the string produced by formatting
        !           765: the values of
        !           766: .UL $1
        !           767: and
        !           768: .UL $2 .
        !           769: .NH 2
        !           770: Variables, Expressions, and Assignments
        !           771: .PP
        !           772: .IT Awk
        !           773: variables take on numeric (floating point)
        !           774: or string values according to context.
        !           775: For example, in
        !           776: .P1
        !           777: x = 1
        !           778: .P2
        !           779: .UL x
        !           780: is clearly a number, while in
        !           781: .P1
        !           782: x = "smith"
        !           783: .P2
        !           784: it is clearly a string.
        !           785: Strings are converted to numbers and
        !           786: vice versa whenever context demands it.
        !           787: For instance,
        !           788: .P1
        !           789: x = "3" + "4"
        !           790: .P2
        !           791: assigns 7 to
        !           792: .UL x .
        !           793: Strings which cannot be interpreted
        !           794: as numbers in a numerical context
        !           795: will generally have numeric value zero,
        !           796: but it is unwise to count on this behavior.
        !           797: .PP
        !           798: By default, variables (other than built-ins) are initialized to the null string,
        !           799: which has numerical value zero;
        !           800: this eliminates the need for most
        !           801: .UL BEGIN
        !           802: sections.
        !           803: For example, the sums of the first two fields can be computed by
        !           804: .P1
        !           805:        { s1 += $1; s2 += $2 }
        !           806: END    { print s1, s2 }
        !           807: .P2
        !           808: .PP
        !           809: Arithmetic is done internally in floating point.
        !           810: The arithmetic operators are
        !           811: .UL + ,
        !           812: .UL \- ,
        !           813: .UL \(** ,
        !           814: .UL / ,
        !           815: and
        !           816: .UL %
        !           817: (mod).
        !           818: The C increment
        !           819: .UL ++
        !           820: and
        !           821: decrement
        !           822: .UL \-\-
        !           823: operators are also available,
        !           824: and so are the assignment operators
        !           825: .UL += ,
        !           826: .UL \-= ,
        !           827: .UL *= ,
        !           828: .UL /= ,
        !           829: and
        !           830: .UL %= .
        !           831: These operators may all be used in expressions.
        !           832: .NH 2
        !           833: Field Variables
        !           834: .PP
        !           835: Fields in
        !           836: .IT awk
        !           837: share essentially all of the properties of variables _
        !           838: they may be used in arithmetic or string operations,
        !           839: and may be assigned to.
        !           840: Thus one can
        !           841: replace the first field with a sequence number like this:
        !           842: .P1
        !           843: { $1 = NR; print }
        !           844: .P2
        !           845: or
        !           846: accumulate two fields into a third, like this:
        !           847: .P1
        !           848: { $1 = $2 + $3; print $0 }
        !           849: .P2
        !           850: or assign a string to a field:
        !           851: .P1
        !           852: { if ($3 > 1000)
        !           853:        $3 = "too big"
        !           854:   print
        !           855: }
        !           856: .P2
        !           857: which replaces the third field by ``too big'' when it is,
        !           858: and in any case prints the record.
        !           859: .PP
        !           860: Field references may be numerical expressions,
        !           861: as in
        !           862: .P1
        !           863: { print $i, $(i+1), $(i+n) }
        !           864: .P2
        !           865: Whether a field is deemed numeric or string depends on context;
        !           866: in ambiguous cases like
        !           867: .P1
        !           868: if ($1 == $2) ...
        !           869: .P2
        !           870: fields are treated as strings.
        !           871: .PP
        !           872: Each input line is split into fields automatically as necessary.
        !           873: It is also possible to split any variable or string
        !           874: into fields:
        !           875: .P1
        !           876: n = split(s, array, sep)
        !           877: .P2
        !           878: splits the
        !           879: string
        !           880: .UL s
        !           881: into
        !           882: .UL array[1] ,
        !           883: \&...,
        !           884: .UL array[n] .
        !           885: The number of elements found is returned.
        !           886: If the
        !           887: .UL sep
        !           888: argument is provided, it is used as the field separator;
        !           889: otherwise
        !           890: .UL FS
        !           891: is used as the separator.
        !           892: .NH 2
        !           893: String Concatenation
        !           894: .PP
        !           895: Strings may be concatenated.
        !           896: For example
        !           897: .P1
        !           898: length($1 $2 $3)
        !           899: .P2
        !           900: returns the length of the first three fields.
        !           901: Or in a
        !           902: .UL print
        !           903: statement,
        !           904: .P1
        !           905: print $1 " is " $2
        !           906: .P2
        !           907: prints
        !           908: the two fields separated by `` is ''.
        !           909: Variables and numeric expressions may also appear in concatenations.
        !           910: .NH 2
        !           911: Arrays
        !           912: .PP
        !           913: Array elements are not declared;
        !           914: they spring into existence by being mentioned.
        !           915: Subscripts may have
        !           916: .ul
        !           917: any
        !           918: non-null
        !           919: value, including non-numeric strings.
        !           920: As an example of a conventional numeric subscript,
        !           921: the statement
        !           922: .P1
        !           923: x[NR] = $0
        !           924: .P2
        !           925: assigns the current input record to
        !           926: the
        !           927: .UL NR -th
        !           928: element of the array
        !           929: .UL x .
        !           930: In fact, it is possible in principle (though perhaps slow)
        !           931: to process the entire input in a random order with the
        !           932: .IT awk
        !           933: program
        !           934: .P1
        !           935:        { x[NR] = $0 }
        !           936: END    { \fI... program ...\fP }
        !           937: .P2
        !           938: The first action merely records each input line in
        !           939: the array
        !           940: .UL x .
        !           941: .PP
        !           942: Array elements may be named by non-numeric values,
        !           943: which gives
        !           944: .IT awk
        !           945: a capability rather like the associative memory of
        !           946: Snobol tables.
        !           947: Suppose the input contains fields with values like
        !           948: .UL apple ,
        !           949: .UL orange ,
        !           950: etc.
        !           951: Then the program
        !           952: .P1
        !           953: /apple/        { x["apple"]++ }
        !           954: /orange/       { x["orange"]++ }
        !           955: END            { print x["apple"], x["orange"] }
        !           956: .P2
        !           957: increments counts for the named array elements,
        !           958: and prints them at the end of the input.
        !           959: .NH 2
        !           960: Flow-of-Control Statements
        !           961: .PP
        !           962: .IT Awk
        !           963: provides the basic flow-of-control statements
        !           964: .UL if-else ,
        !           965: .UL while ,
        !           966: .UL for ,
        !           967: and statement grouping with braces, as in C.
        !           968: We showed the
        !           969: .UL if
        !           970: statement in section 3.3 without describing it.
        !           971: The condition in parentheses is evaluated;
        !           972: if it is true, the statement following the
        !           973: .UL if
        !           974: is done.
        !           975: The
        !           976: .UL else
        !           977: part is optional.
        !           978: .PP
        !           979: The
        !           980: .UL while
        !           981: statement is exactly like that of C.
        !           982: For example, to print all input fields one per line,
        !           983: .P1
        !           984: i = 1
        !           985: while (i <= NF) {
        !           986:        print $i
        !           987:        ++i
        !           988: }
        !           989: .P2
        !           990: .PP
        !           991: The
        !           992: .UL for
        !           993: statement is also exactly that of C:
        !           994: .P1
        !           995: for (i = 1; i <= NF; i++)
        !           996:        print $i
        !           997: .P2
        !           998: does the same job as the
        !           999: .UL while
        !          1000: statement above.
        !          1001: .PP
        !          1002: There is an alternate form of the
        !          1003: .UL for
        !          1004: statement which is suited for accessing the
        !          1005: elements of an associative array:
        !          1006: .P1
        !          1007: for (i in array)
        !          1008:        \fIstatement\f3
        !          1009: .P2
        !          1010: does
        !          1011: .ul
        !          1012: statement
        !          1013: with 
        !          1014: .UL i
        !          1015: set in turn to the subscript of each element of
        !          1016: .UL array .
        !          1017: The elements are accessed in an apparently random order.
        !          1018: Chaos will ensue if 
        !          1019: .UL i
        !          1020: is altered, or if any new elements are
        !          1021: accessed during the loop.
        !          1022: .PP
        !          1023: The expression in the condition part of an
        !          1024: .UL if ,
        !          1025: .UL while
        !          1026: or
        !          1027: .UL for
        !          1028: can include relational operators like
        !          1029: .UL < ,
        !          1030: .UL <= ,
        !          1031: .UL > ,
        !          1032: .UL >= ,
        !          1033: .UL ==
        !          1034: (``is equal to''),
        !          1035: and
        !          1036: .UL !=
        !          1037: (``not equal to'');
        !          1038: regular expression matches with the match operators
        !          1039: .UL ~
        !          1040: and
        !          1041: .UL !~ ;
        !          1042: the logical operators
        !          1043: .UL \||\|| ,
        !          1044: .UL && ,
        !          1045: and
        !          1046: .UL ! ;
        !          1047: and of course parentheses for grouping.
        !          1048: .PP
        !          1049: The
        !          1050: .UL break
        !          1051: statement causes an immediate exit
        !          1052: from an enclosing
        !          1053: .UL while
        !          1054: or
        !          1055: .UL for ;
        !          1056: the
        !          1057: .UL continue
        !          1058: statement
        !          1059: causes the next iteration to begin.
        !          1060: .PP
        !          1061: The statement
        !          1062: .UL next
        !          1063: causes
        !          1064: .IT awk
        !          1065: to skip immediately to
        !          1066: the next record and begin scanning the patterns from the top.
        !          1067: The statement
        !          1068: .UL exit
        !          1069: causes the program to behave as if the end of the input
        !          1070: had occurred.
        !          1071: .PP
        !          1072: Comments may be placed in
        !          1073: .IT awk
        !          1074: programs:
        !          1075: they begin with the character
        !          1076: .UL #
        !          1077: and end with the end of the line,
        !          1078: as in
        !          1079: .P1
        !          1080: print x, y     # this is a comment
        !          1081: .P2
        !          1082: .NH
        !          1083: Design
        !          1084: .PP
        !          1085: The
        !          1086: .UX
        !          1087: system
        !          1088: already provides several programs that
        !          1089: operate by passing input through a
        !          1090: selection mechanism.
        !          1091: .IT Grep ,
        !          1092: the first and simplest, merely prints all lines which
        !          1093: match a single specified pattern.
        !          1094: .IT Egrep
        !          1095: provides more general patterns, i.e., regular expressions
        !          1096: in full generality;
        !          1097: .IT fgrep
        !          1098: searches for a set of keywords with a particularly fast algorithm.
        !          1099: .IT Sed\|
        !          1100: .[
        !          1101: unix programm manual
        !          1102: .]
        !          1103: provides most of the editing facilities of
        !          1104: the editor
        !          1105: .IT ed  ,
        !          1106: applied to a stream of input.
        !          1107: None of these programs provides
        !          1108: numeric capabilities,
        !          1109: logical relations,
        !          1110: or variables.
        !          1111: .PP
        !          1112: .IT Lex\|
        !          1113: .[
        !          1114: lesk lexical analyzer cstr
        !          1115: .]
        !          1116: provides general regular expression recognition capabilities,
        !          1117: and, by serving as a C program generator,
        !          1118: is essentially open-ended in its capabilities.
        !          1119: The use of
        !          1120: .IT lex ,
        !          1121: however, requires a knowledge of C programming,
        !          1122: and a
        !          1123: .IT lex
        !          1124: program must be compiled and loaded before use,
        !          1125: which discourages its use for one-shot applications.
        !          1126: .PP
        !          1127: .IT Awk
        !          1128: is an attempt
        !          1129: to fill in another part of the matrix of possibilities.
        !          1130: It
        !          1131: provides general regular expression capabilities
        !          1132: and an implicit input/output loop.
        !          1133: But it also provides convenient numeric processing,
        !          1134: variables,
        !          1135: more general selection,
        !          1136: and control flow in the actions.
        !          1137: It
        !          1138: does not require compilation or a knowledge of C.
        !          1139: Finally,
        !          1140: .IT awk
        !          1141: provides
        !          1142: a convenient way to access fields within lines;
        !          1143: it is unique in this respect.
        !          1144: .PP
        !          1145: .IT Awk
        !          1146: also tries to integrate strings and numbers
        !          1147: completely,
        !          1148: by treating all quantities as both string and numeric,
        !          1149: deciding which representation is appropriate
        !          1150: as late as possible.
        !          1151: In most cases the user can simply ignore the differences.
        !          1152: .PP
        !          1153: Most of the effort in developing
        !          1154: .I awk
        !          1155: went into deciding what
        !          1156: .I awk
        !          1157: should or should not do
        !          1158: (for instance, it doesn't do string substitution)
        !          1159: and what the syntax should be
        !          1160: (no explicit operator for concatenation)
        !          1161: rather
        !          1162: than on writing or debugging the code.
        !          1163: We have tried
        !          1164: to make the syntax powerful
        !          1165: but easy to use and well adapted
        !          1166: to scanning files.
        !          1167: For example,
        !          1168: the absence of declarations, together with implicit initialization,
        !          1169: while probably a bad idea for a general-purpose programming language,
        !          1170: is desirable in a language
        !          1171: that is meant to be used for tiny programs
        !          1172: that may even be composed on the command line.
        !          1173: .PP
        !          1174: In practice,
        !          1175: .IT awk
        !          1176: usage seems to fall into two broad categories.
        !          1177: One is what might be called ``report generation'' \(em
        !          1178: processing an input to extract counts,
        !          1179: sums, sub-totals, etc.
        !          1180: This also includes the writing of trivial
        !          1181: data validation programs,
        !          1182: such as verifying that a field contains only numeric information
        !          1183: or that certain delimiters are properly balanced.
        !          1184: The combination of textual and numeric processing is invaluable here.
        !          1185: .PP
        !          1186: A second area of use is as a data transformer,
        !          1187: converting data from the form produced by one program
        !          1188: into that expected by another.
        !          1189: The simplest examples merely select fields, perhaps with rearrangements.
        !          1190: .NH
        !          1191: Implementation
        !          1192: .PP
        !          1193: The actual implementation of
        !          1194: .IT awk
        !          1195: uses the language development tools available
        !          1196: on the
        !          1197: .UC UNIX
        !          1198: operating system.
        !          1199: The grammar is specified with
        !          1200: .IT yacc ;
        !          1201: .[
        !          1202: yacc johnson cstr
        !          1203: .]
        !          1204: the lexical analysis is done by
        !          1205: .IT lex ;
        !          1206: the regular expression recognizers are
        !          1207: deterministic finite automata
        !          1208: constructed directly from the expressions.
        !          1209: An
        !          1210: .IT awk
        !          1211: program is translated into a 
        !          1212: parse tree which is then directly executed
        !          1213: by a simple interpreter.
        !          1214: .PP
        !          1215: .IT Awk
        !          1216: was designed for ease of use rather than processing speed;
        !          1217: the delayed evaluation of variable types
        !          1218: and the necessity to break input
        !          1219: into fields make high speed difficult to achieve in any case.
        !          1220: Nonetheless,
        !          1221: the program has not proven to be unworkably slow.
        !          1222: .PP
        !          1223: Table I below shows the execution (user + system) time
        !          1224: on a PDP-11/70 of
        !          1225: the
        !          1226: .UC UNIX
        !          1227: programs
        !          1228: .IT wc ,
        !          1229: .IT grep ,
        !          1230: .IT egrep ,
        !          1231: .IT fgrep ,
        !          1232: .IT sed ,
        !          1233: .IT lex ,
        !          1234: and
        !          1235: .IT awk
        !          1236: on the following simple tasks:
        !          1237: .IP "\ \ 1."
        !          1238: count the number of lines.
        !          1239: .IP "\ \ 2."
        !          1240: print all lines containing ``doug''.
        !          1241: .IP "\ \ 3."
        !          1242: print all lines containing ``doug'', ``ken'' or ``dmr''.
        !          1243: .IP "\ \ 4."
        !          1244: print the third field of each line.
        !          1245: .IP "\ \ 5."
        !          1246: print the third and second fields of each line, in that order.
        !          1247: .IP "\ \ 6."
        !          1248: append all lines containing ``doug'', ``ken'', and ``dmr''
        !          1249: to files ``jdoug'', ``jken'', and ``jdmr'', respectively.
        !          1250: .IP "\ \ 7."
        !          1251: print each line prefixed by ``line-number\ :\ ''.
        !          1252: .IP "\ \ 8."
        !          1253: sum the fourth column of a table.
        !          1254: .LP
        !          1255: The program
        !          1256: .IT wc
        !          1257: merely counts words, lines and characters in its input;
        !          1258: we have already mentioned the others.
        !          1259: In all cases the input was a file containing
        !          1260: 10,000 lines
        !          1261: as created by the
        !          1262: command
        !          1263: .IT "ls \-l" ;
        !          1264: each line has the form
        !          1265: .P1
        !          1266: -rw-rw-rw- 1 ava 123 Oct 15 17:05 xxx
        !          1267: .P2
        !          1268: The total length of this input is
        !          1269: 452,960 characters.
        !          1270: Times for
        !          1271: .IT lex
        !          1272: do not include compile or load.
        !          1273: .PP
        !          1274: As might be expected,
        !          1275: .IT awk
        !          1276: is not as fast as the specialized tools
        !          1277: .IT wc ,
        !          1278: .IT sed ,
        !          1279: or the programs in the
        !          1280: .IT grep
        !          1281: family,
        !          1282: but
        !          1283: is faster than the more general tool
        !          1284: .IT lex .
        !          1285: In all cases, the tasks were
        !          1286: about as easy to express as
        !          1287: .IT awk
        !          1288: programs
        !          1289: as programs in these other languages;
        !          1290: tasks involving fields were
        !          1291: considerably easier to express as
        !          1292: .IT awk
        !          1293: programs.
        !          1294: Some of the test programs are shown in
        !          1295: .IT awk ,
        !          1296: .IT sed
        !          1297: and
        !          1298: .IT lex .
        !          1299: .[
        !          1300: $LIST$
        !          1301: .]
        !          1302: .1C
        !          1303: .TS
        !          1304: center;
        !          1305: c c c c c c c c c
        !          1306: c c c c c c c c c
        !          1307: c|n|n|n|n|n|n|n|n|.
        !          1308:                                Task
        !          1309: Program        1       2       3       4       5       6       7       8
        !          1310: _
        !          1311: \fIwc\fR       8.6
        !          1312: \fIgrep\fR     11.7    13.1
        !          1313: \fIegrep\fR    6.2     11.5    11.6
        !          1314: \fIfgrep\fR    7.7     13.8    16.1
        !          1315: \fIsed\fR      10.2    11.6    15.8    29.0    30.5    16.1
        !          1316: \fIlex\fR      65.1    150.1   144.2   67.7    70.3    104.0   81.7    92.8
        !          1317: \fIawk\fR      15.0    25.6    29.9    33.3    38.9    46.4    71.4    31.1
        !          1318: _
        !          1319: .TE
        !          1320: .sp
        !          1321: .ce
        !          1322: \fBTable I.\fR  Execution Times of Programs. (Times are in sec.)
        !          1323: .sp 2
        !          1324: .2C
        !          1325: .PP
        !          1326: The programs for some of these jobs are shown below.
        !          1327: The
        !          1328: .IT lex
        !          1329: programs are generally too long to show.
        !          1330: .LP
        !          1331: AWK:
        !          1332: .LP
        !          1333: .P1
        !          1334: 1.     END     {print NR}
        !          1335: .P2
        !          1336: .P1
        !          1337: 2.     /doug/
        !          1338: .P2
        !          1339: .P1
        !          1340: 3.     /ken|doug|dmr/
        !          1341: .P2
        !          1342: .P1
        !          1343: 4.     {print $3}
        !          1344: .P2
        !          1345: .P1
        !          1346: 5.     {print $3, $2}
        !          1347: .P2
        !          1348: .P1
        !          1349: 6.     /ken/   {print >"jken"}
        !          1350:        /doug/  {print >"jdoug"}
        !          1351:        /dmr/   {print >"jdmr"}
        !          1352: .P2
        !          1353: .P1
        !          1354: 7.     {print NR ": " $0}
        !          1355: .P2
        !          1356: .P1
        !          1357: 8.             {sum = sum + $4}
        !          1358:        END     {print sum}
        !          1359: .P2
        !          1360: .LP
        !          1361: SED:
        !          1362: .LP
        !          1363: .P1
        !          1364: 1.     $=
        !          1365: .P2
        !          1366: .P1
        !          1367: 2.     /doug/p
        !          1368: .P2
        !          1369: .P1
        !          1370: 3.     /doug/p
        !          1371:        /doug/d
        !          1372:        /ken/p
        !          1373:        /ken/d
        !          1374:        /dmr/p
        !          1375:        /dmr/d
        !          1376: .P2
        !          1377: .P1
        !          1378: 4.     /[^ ]* [ ]*[^ ]* [ ]*\e([^ ]*\e) .*/s//\e1/p
        !          1379: .P2
        !          1380: .P1
        !          1381: 5.     /[^ ]* [ ]*\e([^ ]*\e) [ ]*\e([^ ]*\e) .*/s//\e2 \e1/p
        !          1382: .P2
        !          1383: .P1
        !          1384: 6.     /ken/w jken
        !          1385:        /doug/w jdoug
        !          1386:        /dmr/w jdmr
        !          1387: .P2
        !          1388: .LP
        !          1389: LEX:
        !          1390: .LP
        !          1391: .P1
        !          1392: 1.     %{
        !          1393:        int i;
        !          1394:        %}
        !          1395:        %%
        !          1396:        \en     i++;
        !          1397:        .       ;
        !          1398:        %%
        !          1399:        yywrap() {
        !          1400:                printf("%d\en", i);
        !          1401:        }
        !          1402: .P2
        !          1403: .P1
        !          1404: 2.     %%
        !          1405:        ^.*doug.*$      printf("%s\en", yytext);
        !          1406:        .       ;
        !          1407:        \en     ;
        !          1408: .P2

unix.superglobalmegacorp.com

This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.