|
|
1.1 root 1: /*
2: * Copyright (c) 1982, 1986, 1988, 1990 Regents of the University of California.
3: * All rights reserved.
4: *
5: * Redistribution is only permitted until one year after the first shipment
6: * of 4.4BSD by the Regents. Otherwise, redistribution and use in source and
7: * binary forms are permitted provided that: (1) source distributions retain
8: * this entire copyright notice and comment, and (2) distributions including
9: * binaries display the following acknowledgement: ``This product includes
10: * software developed by the University of California, Berkeley and its
11: * contributors'' in the documentation or other materials provided with the
12: * distribution and in all advertising materials mentioning features or use
13: * of this software. Neither the name of the University nor the names of
14: * its contributors may be used to endorse or promote products derived from
15: * this software without specific prior written permission.
16: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
17: * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
18: * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
19: *
20: * @(#)tcp_input.c 7.25 (Berkeley) 6/30/90
21: */
22:
23: #include "param.h"
24: #include "systm.h"
25: #include "malloc.h"
26: #include "mbuf.h"
27: #include "protosw.h"
28: #include "socket.h"
29: #include "socketvar.h"
30: #include "errno.h"
31:
32: #include "../net/if.h"
33: #include "../net/route.h"
34:
35: #include "in.h"
36: #include "in_systm.h"
37: #include "ip.h"
38: #include "in_pcb.h"
39: #include "ip_var.h"
40: #include "tcp.h"
41: #include "tcp_fsm.h"
42: #include "tcp_seq.h"
43: #include "tcp_timer.h"
44: #include "tcp_var.h"
45: #include "tcpip.h"
46: #include "tcp_debug.h"
47:
48: int tcprexmtthresh = 3; /* duplicate-ACK count at which tcp_input retransmits */
49: int tcppredack; /* XXX debugging: times hdr predict ok for acks */
50: int tcppreddat; /* XXX # times header prediction ok for data packets */
51: int tcppcbcachemiss; /* # times tcp_last_inpcb did not match the segment */
52: struct tcpiphdr tcp_saveti; /* copy of the incoming header, saved when SO_DEBUG is set */
53: struct inpcb *tcp_last_inpcb = &tcb; /* one-entry cache: last pcb located by tcp_input */
54:
55: struct tcpcb *tcp_newtcpcb();
56:
/*
 * TCP_REASS(tp, ti, m, so, flags): deliver or queue an arriving segment.
 *
 * Fast path: when segment ti is the next one expected (ti_seq == rcv_nxt),
 * the reassembly queue of tp is empty and the connection is ESTABLISHED,
 * the data in m is appended directly to so's receive buffer, avoiding
 * linkage into and removal from the queue and repetition of various
 * conversions.  DELACK is set so the ack can be delayed.
 *
 * Otherwise the segment is handed to tcp_reass() and ACKNOW is set, so
 * out-of-order segments are acked immediately (letting fast retransmit
 * work).
 *
 * flags must be an lvalue; it receives TH_FIN if the data now delivered
 * includes a segment with FIN, else 0.  Every argument is parenthesized
 * at each use, so an expression such as &cb may be passed for tp, but
 * the arguments are evaluated more than once and must be free of side
 * effects.  The expansion is a brace-enclosed block.
 */
#define	TCP_REASS(tp, ti, m, so, flags) { \
	if ((ti)->ti_seq == (tp)->rcv_nxt && \
	    (tp)->seg_next == (struct tcpiphdr *)(tp) && \
	    (tp)->t_state == TCPS_ESTABLISHED) { \
		(tp)->t_flags |= TF_DELACK; \
		(tp)->rcv_nxt += (ti)->ti_len; \
		(flags) = (ti)->ti_flags & TH_FIN; \
		tcpstat.tcps_rcvpack++; \
		tcpstat.tcps_rcvbyte += (ti)->ti_len; \
		sbappend(&(so)->so_rcv, (m)); \
		sorwakeup(so); \
	} else { \
		(flags) = tcp_reass((tp), (ti), (m)); \
		(tp)->t_flags |= TF_ACKNOW; \
	} \
}
83:
/*
 * tcp_reass(tp, ti, m): link segment ti (carried in mbuf chain m) into the
 * reassembly list of tp in sequence order, trimming overlap with its
 * neighbours, then pass any data now contiguous at rcv_nxt to the socket's
 * receive buffer.
 *
 * tp - TCP control block; tp itself serves as the list head, and a list
 *      whose seg_next points back at tp holds no segments.
 * ti - header of the arriving segment, or 0 to skip insertion and only
 *      attempt delivery of data that is already queued.
 * m  - mbuf chain holding the segment; it is freed here when the segment
 *      is entirely covered by the preceding queued segment.
 *
 * Returns 0 when nothing was dequeued; otherwise the TH_FIN bit of the
 * last segment dequeued.
 */
84: tcp_reass(tp, ti, m)
85: register struct tcpcb *tp;
86: register struct tcpiphdr *ti;
87: struct mbuf *m;
88: {
89: register struct tcpiphdr *q;
90: struct socket *so = tp->t_inpcb->inp_socket;
91: int flags;
92:
93: /*
94: * Call with ti==0 after becoming established to
95: * force pre-ESTABLISHED data up to user socket.
96: */
97: if (ti == 0)
98: goto present;
99:
100: /*
101: * Find a segment which begins after this one does.
102: * If none does, q ends up at the list head (tp).
103: */
103: for (q = tp->seg_next; q != (struct tcpiphdr *)tp;
104: q = (struct tcpiphdr *)q->ti_next)
105: if (SEQ_GT(q->ti_seq, ti->ti_seq))
106: break;
107:
108: /*
109: * If there is a preceding segment, it may provide some of
110: * our data already. If so, drop the data from the incoming
111: * segment. If it provides all of our data, drop us.
112: */
113: if ((struct tcpiphdr *)q->ti_prev != (struct tcpiphdr *)tp) {
114: register int i;
115: q = (struct tcpiphdr *)q->ti_prev;
116: /* conversion to int (in i) handles seq wraparound */
117: i = q->ti_seq + q->ti_len - ti->ti_seq;
118: if (i > 0) {
119: if (i >= ti->ti_len) {
120: tcpstat.tcps_rcvduppack++;
121: tcpstat.tcps_rcvdupbyte += ti->ti_len;
122: m_freem(m);
123: return (0);
124: }
125: m_adj(m, i);
126: ti->ti_len -= i;
127: ti->ti_seq += i;
128: }
129: q = (struct tcpiphdr *)(q->ti_next); /* back to the first segment after ti */
130: }
131: tcpstat.tcps_rcvoopack++;
132: tcpstat.tcps_rcvoobyte += ti->ti_len;
133: REASS_MBUF(ti) = m; /* XXX saved so the chain can be recovered when ti is dequeued */
134:
135: /*
136: * While we overlap succeeding segments trim them or,
137: * if they are completely covered, dequeue them.
138: */
139: while (q != (struct tcpiphdr *)tp) {
140: register int i = (ti->ti_seq + ti->ti_len) - q->ti_seq;
141: if (i <= 0)
142: break;
143: if (i < q->ti_len) {
144: q->ti_seq += i;
145: q->ti_len -= i;
146: m_adj(REASS_MBUF(q), i);
147: break;
148: }
149: q = (struct tcpiphdr *)q->ti_next; /* step past the covered segment before unlinking it */
150: m = REASS_MBUF((struct tcpiphdr *)q->ti_prev);
151: remque(q->ti_prev);
152: m_freem(m);
153: }
154:
155: /*
156: * Stick new segment in its place: q is now the first
157: * surviving successor (or the list head), so ti is linked
158: * in after q's predecessor.
159: */
158: insque(ti, q->ti_prev);
159:
160: present:
161: /*
162: * Present data to user, advancing rcv_nxt through
163: * completed sequence space.
164: */
165: if (TCPS_HAVERCVDSYN(tp->t_state) == 0)
166: return (0);
167: ti = tp->seg_next;
/* nothing queued, or the first queued segment is not the next one expected */
168: if (ti == (struct tcpiphdr *)tp || ti->ti_seq != tp->rcv_nxt)
169: return (0);
/* in SYN_RECEIVED, segments that carry data stay queued */
170: if (tp->t_state == TCPS_SYN_RECEIVED && ti->ti_len)
171: return (0);
172: do {
173: tp->rcv_nxt += ti->ti_len;
174: flags = ti->ti_flags & TH_FIN; /* overwritten each pass: reflects the last segment dequeued */
175: remque(ti);
176: m = REASS_MBUF(ti);
177: ti = (struct tcpiphdr *)ti->ti_next;
178: if (so->so_state & SS_CANTRCVMORE)
179: m_freem(m); /* socket cannot receive more: discard the data */
180: else
181: sbappend(&so->so_rcv, m);
182: } while (ti != (struct tcpiphdr *)tp && ti->ti_seq == tp->rcv_nxt);
183: sorwakeup(so);
184: return (flags);
185: }
186:
187: /*
188: * TCP input routine, follows pages 65-76 of the
189: * protocol specification dated September, 1981 very closely.
190: */
191: tcp_input(m, iphlen)
192: register struct mbuf *m;
193: int iphlen;
194: {
195: register struct tcpiphdr *ti;
196: register struct inpcb *inp;
197: struct mbuf *om = 0;
198: int len, tlen, off;
199: register struct tcpcb *tp = 0;
200: register int tiflags;
201: struct socket *so;
202: int todrop, acked, ourfinisacked, needoutput = 0;
203: short ostate;
204: struct in_addr laddr;
205: int dropsocket = 0;
206: int iss = 0;
207:
208: tcpstat.tcps_rcvtotal++;
209: /*
210: * Get IP and TCP header together in first mbuf.
211: * Note: IP leaves IP header in first mbuf.
212: */
213: ti = mtod(m, struct tcpiphdr *);
214: if (iphlen > sizeof (struct ip))
215: ip_stripoptions(m, (struct mbuf *)0);
216: if (m->m_len < sizeof (struct tcpiphdr)) {
217: if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
218: tcpstat.tcps_rcvshort++;
219: return;
220: }
221: ti = mtod(m, struct tcpiphdr *);
222: }
223:
224: /*
225: * Checksum extended TCP header and data.
226: */
227: tlen = ((struct ip *)ti)->ip_len;
228: len = sizeof (struct ip) + tlen;
229: ti->ti_next = ti->ti_prev = 0;
230: ti->ti_x1 = 0;
231: ti->ti_len = (u_short)tlen;
232: HTONS(ti->ti_len);
233: if (ti->ti_sum = in_cksum(m, len)) {
234: tcpstat.tcps_rcvbadsum++;
235: goto drop;
236: }
237:
238: /*
239: * Check that TCP offset makes sense,
240: * pull out TCP options and adjust length. XXX
241: */
242: off = ti->ti_off << 2;
243: if (off < sizeof (struct tcphdr) || off > tlen) {
244: tcpstat.tcps_rcvbadoff++;
245: goto drop;
246: }
247: tlen -= off;
248: ti->ti_len = tlen;
249: if (off > sizeof (struct tcphdr)) {
250: if (m->m_len < sizeof(struct ip) + off) {
251: if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) {
252: tcpstat.tcps_rcvshort++;
253: return;
254: }
255: ti = mtod(m, struct tcpiphdr *);
256: }
257: om = m_get(M_DONTWAIT, MT_DATA);
258: if (om == 0)
259: goto drop;
260: om->m_len = off - sizeof (struct tcphdr);
261: { caddr_t op = mtod(m, caddr_t) + sizeof (struct tcpiphdr);
262: bcopy(op, mtod(om, caddr_t), (unsigned)om->m_len);
263: m->m_len -= om->m_len;
264: m->m_pkthdr.len -= om->m_len;
265: bcopy(op+om->m_len, op,
266: (unsigned)(m->m_len-sizeof (struct tcpiphdr)));
267: }
268: }
269: tiflags = ti->ti_flags;
270:
271: /*
272: * Convert TCP protocol specific fields to host format.
273: */
274: NTOHL(ti->ti_seq);
275: NTOHL(ti->ti_ack);
276: NTOHS(ti->ti_win);
277: NTOHS(ti->ti_urp);
278:
279: /*
280: * Locate pcb for segment.
281: */
282: findpcb:
283: inp = tcp_last_inpcb;
284: if (inp->inp_lport != ti->ti_dport ||
285: inp->inp_fport != ti->ti_sport ||
286: inp->inp_faddr.s_addr != ti->ti_src.s_addr ||
287: inp->inp_laddr.s_addr != ti->ti_dst.s_addr) {
288: inp = in_pcblookup(&tcb, ti->ti_src, ti->ti_sport,
289: ti->ti_dst, ti->ti_dport, INPLOOKUP_WILDCARD);
290: if (inp)
291: tcp_last_inpcb = inp;
292: ++tcppcbcachemiss;
293: }
294:
295: /*
296: * If the state is CLOSED (i.e., TCB does not exist) then
297: * all data in the incoming segment is discarded.
298: * If the TCB exists but is in CLOSED state, it is embryonic,
299: * but should either do a listen or a connect soon.
300: */
301: if (inp == 0)
302: goto dropwithreset;
303: tp = intotcpcb(inp);
304: if (tp == 0)
305: goto dropwithreset;
306: if (tp->t_state == TCPS_CLOSED)
307: goto drop;
308: so = inp->inp_socket;
309: if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
310: if (so->so_options & SO_DEBUG) {
311: ostate = tp->t_state;
312: tcp_saveti = *ti;
313: }
314: if (so->so_options & SO_ACCEPTCONN) {
315: so = sonewconn(so, 0);
316: if (so == 0)
317: goto drop;
318: /*
319: * This is ugly, but ....
320: *
321: * Mark socket as temporary until we're
322: * committed to keeping it. The code at
323: * ``drop'' and ``dropwithreset'' check the
324: * flag dropsocket to see if the temporary
325: * socket created here should be discarded.
326: * We mark the socket as discardable until
327: * we're committed to it below in TCPS_LISTEN.
328: */
329: dropsocket++;
330: inp = (struct inpcb *)so->so_pcb;
331: inp->inp_laddr = ti->ti_dst;
332: inp->inp_lport = ti->ti_dport;
333: #if BSD>=43
334: inp->inp_options = ip_srcroute();
335: #endif
336: tp = intotcpcb(inp);
337: tp->t_state = TCPS_LISTEN;
338: }
339: }
340:
341: /*
342: * Segment received on connection.
343: * Reset idle time and keep-alive timer.
344: */
345: tp->t_idle = 0;
346: tp->t_timer[TCPT_KEEP] = tcp_keepidle;
347:
348: /*
349: * Process options if not in LISTEN state,
350: * else do it below (after getting remote address).
351: */
352: if (om && tp->t_state != TCPS_LISTEN) {
353: tcp_dooptions(tp, om, ti);
354: om = 0;
355: }
356: /*
357: * Header prediction: check for the two common cases
358: * of a uni-directional data xfer. If the packet has
359: * no control flags, is in-sequence, the window didn't
360: * change and we're not retransmitting, it's a
361: * candidate. If the length is zero and the ack moved
362: * forward, we're the sender side of the xfer. Just
363: * free the data acked & wake any higher level process
364: * that was blocked waiting for space. If the length
365: * is non-zero and the ack didn't move, we're the
366: * receiver side. If we're getting packets in-order
367: * (the reassembly queue is empty), add the data to
368: * the socket buffer and note that we need a delayed ack.
369: */
370: if (tp->t_state == TCPS_ESTABLISHED &&
371: (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
372: ti->ti_seq == tp->rcv_nxt &&
373: ti->ti_win && ti->ti_win == tp->snd_wnd &&
374: tp->snd_nxt == tp->snd_max) {
375: if (ti->ti_len == 0) {
376: if (SEQ_GT(ti->ti_ack, tp->snd_una) &&
377: SEQ_LEQ(ti->ti_ack, tp->snd_max) &&
378: tp->snd_cwnd >= tp->snd_wnd) {
379: /*
380: * this is a pure ack for outstanding data.
381: */
382: ++tcppredack;
383: if (tp->t_rtt && SEQ_GT(ti->ti_ack,tp->t_rtseq))
384: tcp_xmit_timer(tp);
385: acked = ti->ti_ack - tp->snd_una;
386: tcpstat.tcps_rcvackpack++;
387: tcpstat.tcps_rcvackbyte += acked;
388: sbdrop(&so->so_snd, acked);
389: tp->snd_una = ti->ti_ack;
390: m_freem(m);
391:
392: /*
393: * If all outstanding data are acked, stop
394: * retransmit timer, otherwise restart timer
395: * using current (possibly backed-off) value.
396: * If process is waiting for space,
397: * wakeup/selwakeup/signal. If data
398: * are ready to send, let tcp_output
399: * decide between more output or persist.
400: */
401: if (tp->snd_una == tp->snd_max)
402: tp->t_timer[TCPT_REXMT] = 0;
403: else if (tp->t_timer[TCPT_PERSIST] == 0)
404: tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
405:
406: if (so->so_snd.sb_flags & SB_NOTIFY)
407: sowwakeup(so);
408: if (so->so_snd.sb_cc)
409: (void) tcp_output(tp);
410: return;
411: }
412: } else if (ti->ti_ack == tp->snd_una &&
413: tp->seg_next == (struct tcpiphdr *)tp &&
414: ti->ti_len <= sbspace(&so->so_rcv)) {
415: /*
416: * this is a pure, in-sequence data packet
417: * with nothing on the reassembly queue and
418: * we have enough buffer space to take it.
419: */
420: ++tcppreddat;
421: tp->rcv_nxt += ti->ti_len;
422: tcpstat.tcps_rcvpack++;
423: tcpstat.tcps_rcvbyte += ti->ti_len;
424: /*
425: * Drop TCP and IP headers then add data
426: * to socket buffer
427: */
428: m->m_data += sizeof(struct tcpiphdr);
429: m->m_len -= sizeof(struct tcpiphdr);
430: sbappend(&so->so_rcv, m);
431: sorwakeup(so);
432: tp->t_flags |= TF_DELACK;
433: return;
434: }
435: }
436:
437: /*
438: * Drop TCP and IP headers; TCP options were dropped above.
439: */
440: m->m_data += sizeof(struct tcpiphdr);
441: m->m_len -= sizeof(struct tcpiphdr);
442:
443: /*
444: * Calculate amount of space in receive window,
445: * and then do TCP input processing.
446: * Receive window is amount of space in rcv queue,
447: * but not less than advertised window.
448: */
449: { int win;
450:
451: win = sbspace(&so->so_rcv);
452: if (win < 0)
453: win = 0;
454: tp->rcv_wnd = max(win, (int)(tp->rcv_adv - tp->rcv_nxt));
455: }
456:
457: switch (tp->t_state) {
458:
459: /*
460: * If the state is LISTEN then ignore segment if it contains an RST.
461: * If the segment contains an ACK then it is bad and send a RST.
462: * If it does not contain a SYN then it is not interesting; drop it.
463: * Don't bother responding if the destination was a broadcast.
464: * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
465: * tp->iss, and send a segment:
466: * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
467: * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
468: * Fill in remote peer address fields if not previously specified.
469: * Enter SYN_RECEIVED state, and process any other fields of this
470: * segment in this state.
471: */
472: case TCPS_LISTEN: {
473: struct mbuf *am;
474: register struct sockaddr_in *sin;
475:
476: if (tiflags & TH_RST)
477: goto drop;
478: if (tiflags & TH_ACK)
479: goto dropwithreset;
480: if ((tiflags & TH_SYN) == 0)
481: goto drop;
482: if (m->m_flags & M_BCAST)
483: goto drop;
484: am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */
485: if (am == NULL)
486: goto drop;
487: am->m_len = sizeof (struct sockaddr_in);
488: sin = mtod(am, struct sockaddr_in *);
489: sin->sin_family = AF_INET;
490: sin->sin_len = sizeof(*sin);
491: sin->sin_addr = ti->ti_src;
492: sin->sin_port = ti->ti_sport;
493: laddr = inp->inp_laddr;
494: if (inp->inp_laddr.s_addr == INADDR_ANY)
495: inp->inp_laddr = ti->ti_dst;
496: if (in_pcbconnect(inp, am)) {
497: inp->inp_laddr = laddr;
498: (void) m_free(am);
499: goto drop;
500: }
501: (void) m_free(am);
502: tp->t_template = tcp_template(tp);
503: if (tp->t_template == 0) {
504: tp = tcp_drop(tp, ENOBUFS);
505: dropsocket = 0; /* socket is already gone */
506: goto drop;
507: }
508: if (om) {
509: tcp_dooptions(tp, om, ti);
510: om = 0;
511: }
512: if (iss)
513: tp->iss = iss;
514: else
515: tp->iss = tcp_iss;
516: tcp_iss += TCP_ISSINCR/2;
517: tp->irs = ti->ti_seq;
518: tcp_sendseqinit(tp);
519: tcp_rcvseqinit(tp);
520: tp->t_flags |= TF_ACKNOW;
521: tp->t_state = TCPS_SYN_RECEIVED;
522: tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
523: dropsocket = 0; /* committed to socket */
524: tcpstat.tcps_accepts++;
525: goto trimthenstep6;
526: }
527:
528: /*
529: * If the state is SYN_SENT:
530: * if seg contains an ACK, but not for our SYN, drop the input.
531: * if seg contains a RST, then drop the connection.
532: * if seg does not contain SYN, then drop it.
533: * Otherwise this is an acceptable SYN segment
534: * initialize tp->rcv_nxt and tp->irs
535: * if seg contains ack then advance tp->snd_una
536: * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
537: * arrange for segment to be acked (eventually)
538: * continue processing rest of data/controls, beginning with URG
539: */
540: case TCPS_SYN_SENT:
541: if ((tiflags & TH_ACK) &&
542: (SEQ_LEQ(ti->ti_ack, tp->iss) ||
543: SEQ_GT(ti->ti_ack, tp->snd_max)))
544: goto dropwithreset;
545: if (tiflags & TH_RST) {
546: if (tiflags & TH_ACK)
547: tp = tcp_drop(tp, ECONNREFUSED);
548: goto drop;
549: }
550: if ((tiflags & TH_SYN) == 0)
551: goto drop;
552: if (tiflags & TH_ACK) {
553: tp->snd_una = ti->ti_ack;
554: if (SEQ_LT(tp->snd_nxt, tp->snd_una))
555: tp->snd_nxt = tp->snd_una;
556: }
557: tp->t_timer[TCPT_REXMT] = 0;
558: tp->irs = ti->ti_seq;
559: tcp_rcvseqinit(tp);
560: tp->t_flags |= TF_ACKNOW;
561: if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
562: tcpstat.tcps_connects++;
563: soisconnected(so);
564: tp->t_state = TCPS_ESTABLISHED;
565: (void) tcp_reass(tp, (struct tcpiphdr *)0,
566: (struct mbuf *)0);
567: /*
568: * if we didn't have to retransmit the SYN,
569: * use its rtt as our initial srtt & rtt var.
570: */
571: if (tp->t_rtt)
572: tcp_xmit_timer(tp);
573: } else
574: tp->t_state = TCPS_SYN_RECEIVED;
575:
576: trimthenstep6:
577: /*
578: * Advance ti->ti_seq to correspond to first data byte.
579: * If data, trim to stay within window,
580: * dropping FIN if necessary.
581: */
582: ti->ti_seq++;
583: if (ti->ti_len > tp->rcv_wnd) {
584: todrop = ti->ti_len - tp->rcv_wnd;
585: m_adj(m, -todrop);
586: ti->ti_len = tp->rcv_wnd;
587: tiflags &= ~TH_FIN;
588: tcpstat.tcps_rcvpackafterwin++;
589: tcpstat.tcps_rcvbyteafterwin += todrop;
590: }
591: tp->snd_wl1 = ti->ti_seq - 1;
592: tp->rcv_up = ti->ti_seq;
593: goto step6;
594: }
595:
596: /*
597: * States other than LISTEN or SYN_SENT.
598: * First check that at least some bytes of segment are within
599: * receive window. If segment begins before rcv_nxt,
600: * drop leading data (and SYN); if nothing left, just ack.
601: */
602: todrop = tp->rcv_nxt - ti->ti_seq;
603: if (todrop > 0) {
604: if (tiflags & TH_SYN) {
605: tiflags &= ~TH_SYN;
606: ti->ti_seq++;
607: if (ti->ti_urp > 1)
608: ti->ti_urp--;
609: else
610: tiflags &= ~TH_URG;
611: todrop--;
612: }
613: if (todrop > ti->ti_len ||
614: todrop == ti->ti_len && (tiflags&TH_FIN) == 0) {
615: tcpstat.tcps_rcvduppack++;
616: tcpstat.tcps_rcvdupbyte += ti->ti_len;
617: /*
618: * If segment is just one to the left of the window,
619: * check two special cases:
620: * 1. Don't toss RST in response to 4.2-style keepalive.
621: * 2. If the only thing to drop is a FIN, we can drop
622: * it, but check the ACK or we will get into FIN
623: * wars if our FINs crossed (both CLOSING).
624: * In either case, send ACK to resynchronize,
625: * but keep on processing for RST or ACK.
626: */
627: if ((tiflags & TH_FIN && todrop == ti->ti_len + 1)
628: #ifdef TCP_COMPAT_42
629: || (tiflags & TH_RST && ti->ti_seq == tp->rcv_nxt - 1)
630: #endif
631: ) {
632: todrop = ti->ti_len;
633: tiflags &= ~TH_FIN;
634: tp->t_flags |= TF_ACKNOW;
635: } else
636: goto dropafterack;
637: } else {
638: tcpstat.tcps_rcvpartduppack++;
639: tcpstat.tcps_rcvpartdupbyte += todrop;
640: }
641: m_adj(m, todrop);
642: ti->ti_seq += todrop;
643: ti->ti_len -= todrop;
644: if (ti->ti_urp > todrop)
645: ti->ti_urp -= todrop;
646: else {
647: tiflags &= ~TH_URG;
648: ti->ti_urp = 0;
649: }
650: }
651:
652: /*
653: * If new data are received on a connection after the
654: * user processes are gone, then RST the other end.
655: */
656: if ((so->so_state & SS_NOFDREF) &&
657: tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) {
658: tp = tcp_close(tp);
659: tcpstat.tcps_rcvafterclose++;
660: goto dropwithreset;
661: }
662:
663: /*
664: * If segment ends after window, drop trailing data
665: * (and PUSH and FIN); if nothing left, just ACK.
666: */
667: todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd);
668: if (todrop > 0) {
669: tcpstat.tcps_rcvpackafterwin++;
670: if (todrop >= ti->ti_len) {
671: tcpstat.tcps_rcvbyteafterwin += ti->ti_len;
672: /*
673: * If a new connection request is received
674: * while in TIME_WAIT, drop the old connection
675: * and start over if the sequence numbers
676: * are above the previous ones.
677: */
678: if (tiflags & TH_SYN &&
679: tp->t_state == TCPS_TIME_WAIT &&
680: SEQ_GT(ti->ti_seq, tp->rcv_nxt)) {
681: iss = tp->rcv_nxt + TCP_ISSINCR;
682: tp = tcp_close(tp);
683: goto findpcb;
684: }
685: /*
686: * If window is closed can only take segments at
687: * window edge, and have to drop data and PUSH from
688: * incoming segments. Continue processing, but
689: * remember to ack. Otherwise, drop segment
690: * and ack.
691: */
692: if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) {
693: tp->t_flags |= TF_ACKNOW;
694: tcpstat.tcps_rcvwinprobe++;
695: } else
696: goto dropafterack;
697: } else
698: tcpstat.tcps_rcvbyteafterwin += todrop;
699: m_adj(m, -todrop);
700: ti->ti_len -= todrop;
701: tiflags &= ~(TH_PUSH|TH_FIN);
702: }
703:
704: /*
705: * If the RST bit is set examine the state:
706: * SYN_RECEIVED STATE:
707: * If passive open, return to LISTEN state.
708: * If active open, inform user that connection was refused.
709: * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
710: * Inform user that connection was reset, and close tcb.
711: * CLOSING, LAST_ACK, TIME_WAIT STATES
712: * Close the tcb.
713: */
714: if (tiflags&TH_RST) switch (tp->t_state) {
715:
716: case TCPS_SYN_RECEIVED:
717: so->so_error = ECONNREFUSED;
718: goto close;
719:
720: case TCPS_ESTABLISHED:
721: case TCPS_FIN_WAIT_1:
722: case TCPS_FIN_WAIT_2:
723: case TCPS_CLOSE_WAIT:
724: so->so_error = ECONNRESET;
725: close:
726: tp->t_state = TCPS_CLOSED;
727: tcpstat.tcps_drops++;
728: tp = tcp_close(tp);
729: goto drop;
730:
731: case TCPS_CLOSING:
732: case TCPS_LAST_ACK:
733: case TCPS_TIME_WAIT:
734: tp = tcp_close(tp);
735: goto drop;
736: }
737:
738: /*
739: * If a SYN is in the window, then this is an
740: * error and we send an RST and drop the connection.
741: */
742: if (tiflags & TH_SYN) {
743: tp = tcp_drop(tp, ECONNRESET);
744: goto dropwithreset;
745: }
746:
747: /*
748: * If the ACK bit is off we drop the segment and return.
749: */
750: if ((tiflags & TH_ACK) == 0)
751: goto drop;
752:
753: /*
754: * Ack processing.
755: */
756: switch (tp->t_state) {
757:
758: /*
759: * In SYN_RECEIVED state if the ack ACKs our SYN then enter
760: * ESTABLISHED state and continue processing, otherwise
761: * send an RST.
762: */
763: case TCPS_SYN_RECEIVED:
764: if (SEQ_GT(tp->snd_una, ti->ti_ack) ||
765: SEQ_GT(ti->ti_ack, tp->snd_max))
766: goto dropwithreset;
767: tcpstat.tcps_connects++;
768: soisconnected(so);
769: tp->t_state = TCPS_ESTABLISHED;
770: (void) tcp_reass(tp, (struct tcpiphdr *)0, (struct mbuf *)0);
771: tp->snd_wl1 = ti->ti_seq - 1;
772: /* fall into ... */
773:
774: /*
775: * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
776: * ACKs. If the ack is in the range
777: * tp->snd_una < ti->ti_ack <= tp->snd_max
778: * then advance tp->snd_una to ti->ti_ack and drop
779: * data from the retransmission queue. If this ACK reflects
780: * more up to date window information we update our window information.
781: */
782: case TCPS_ESTABLISHED:
783: case TCPS_FIN_WAIT_1:
784: case TCPS_FIN_WAIT_2:
785: case TCPS_CLOSE_WAIT:
786: case TCPS_CLOSING:
787: case TCPS_LAST_ACK:
788: case TCPS_TIME_WAIT:
789:
790: if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) {
791: if (ti->ti_len == 0 && ti->ti_win == tp->snd_wnd) {
792: tcpstat.tcps_rcvdupack++;
793: /*
794: * If we have outstanding data (other than
795: * a window probe), this is a completely
796: * duplicate ack (ie, window info didn't
797: * change), the ack is the biggest we've
798: * seen and we've seen exactly our rexmt
799: * threshold of them, assume a packet
800: * has been dropped and retransmit it.
801: * Kludge snd_nxt & the congestion
802: * window so we send only this one
803: * packet.
804: *
805: * We know we're losing at the current
806: * window size so do congestion avoidance
807: * (set ssthresh to half the current window
808: * and pull our congestion window back to
809: * the new ssthresh).
810: *
811: * Dup acks mean that packets have left the
812: * network (they're now cached at the receiver)
813: * so bump cwnd by the amount in the receiver
814: * to keep a constant cwnd packets in the
815: * network.
816: */
817: if (tp->t_timer[TCPT_REXMT] == 0 ||
818: ti->ti_ack != tp->snd_una)
819: tp->t_dupacks = 0;
820: else if (++tp->t_dupacks == tcprexmtthresh) {
821: tcp_seq onxt = tp->snd_nxt;
822: u_int win =
823: min(tp->snd_wnd, tp->snd_cwnd) / 2 /
824: tp->t_maxseg;
825:
826: if (win < 2)
827: win = 2;
828: tp->snd_ssthresh = win * tp->t_maxseg;
829: tp->t_timer[TCPT_REXMT] = 0;
830: tp->t_rtt = 0;
831: tp->snd_nxt = ti->ti_ack;
832: tp->snd_cwnd = tp->t_maxseg;
833: (void) tcp_output(tp);
834: tp->snd_cwnd = tp->snd_ssthresh +
835: tp->t_maxseg * tp->t_dupacks;
836: if (SEQ_GT(onxt, tp->snd_nxt))
837: tp->snd_nxt = onxt;
838: goto drop;
839: } else if (tp->t_dupacks > tcprexmtthresh) {
840: tp->snd_cwnd += tp->t_maxseg;
841: (void) tcp_output(tp);
842: goto drop;
843: }
844: } else
845: tp->t_dupacks = 0;
846: break;
847: }
848: /*
849: * If the congestion window was inflated to account
850: * for the other side's cached packets, retract it.
851: */
852: if (tp->t_dupacks > tcprexmtthresh &&
853: tp->snd_cwnd > tp->snd_ssthresh)
854: tp->snd_cwnd = tp->snd_ssthresh;
855: tp->t_dupacks = 0;
856: if (SEQ_GT(ti->ti_ack, tp->snd_max)) {
857: tcpstat.tcps_rcvacktoomuch++;
858: goto dropafterack;
859: }
860: acked = ti->ti_ack - tp->snd_una;
861: tcpstat.tcps_rcvackpack++;
862: tcpstat.tcps_rcvackbyte += acked;
863:
864: /*
865: * If transmit timer is running and timed sequence
866: * number was acked, update smoothed round trip time.
867: * Since we now have an rtt measurement, cancel the
868: * timer backoff (cf., Phil Karn's retransmit alg.).
869: * Recompute the initial retransmit timer.
870: */
871: if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq))
872: tcp_xmit_timer(tp);
873:
874: /*
875: * If all outstanding data is acked, stop retransmit
876: * timer and remember to restart (more output or persist).
877: * If there is more data to be acked, restart retransmit
878: * timer, using current (possibly backed-off) value.
879: */
880: if (ti->ti_ack == tp->snd_max) {
881: tp->t_timer[TCPT_REXMT] = 0;
882: needoutput = 1;
883: } else if (tp->t_timer[TCPT_PERSIST] == 0)
884: tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
885: /*
886: * When new data is acked, open the congestion window.
887: * If the window gives us less than ssthresh packets
888: * in flight, open exponentially (maxseg per packet).
889: * Otherwise open linearly: maxseg per window
890: * (maxseg^2 / cwnd per packet), plus a constant
891: * fraction of a packet (maxseg/8) to help larger windows
892: * open quickly enough.
893: */
894: {
895: register u_int cw = tp->snd_cwnd;
896: register u_int incr = tp->t_maxseg;
897:
898: if (cw > tp->snd_ssthresh)
899: incr = incr * incr / cw + incr / 8;
900: tp->snd_cwnd = min(cw + incr, TCP_MAXWIN);
901: }
902: if (acked > so->so_snd.sb_cc) {
903: tp->snd_wnd -= so->so_snd.sb_cc;
904: sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
905: ourfinisacked = 1;
906: } else {
907: sbdrop(&so->so_snd, acked);
908: tp->snd_wnd -= acked;
909: ourfinisacked = 0;
910: }
911: if (so->so_snd.sb_flags & SB_NOTIFY)
912: sowwakeup(so);
913: tp->snd_una = ti->ti_ack;
914: if (SEQ_LT(tp->snd_nxt, tp->snd_una))
915: tp->snd_nxt = tp->snd_una;
916:
917: switch (tp->t_state) {
918:
919: /*
920: * In FIN_WAIT_1 STATE in addition to the processing
921: * for the ESTABLISHED state if our FIN is now acknowledged
922: * then enter FIN_WAIT_2.
923: */
924: case TCPS_FIN_WAIT_1:
925: if (ourfinisacked) {
926: /*
927: * If we can't receive any more
928: * data, then closing user can proceed.
929: * Starting the timer is contrary to the
930: * specification, but if we don't get a FIN
931: * we'll hang forever.
932: */
933: if (so->so_state & SS_CANTRCVMORE) {
934: soisdisconnected(so);
935: tp->t_timer[TCPT_2MSL] = tcp_maxidle;
936: }
937: tp->t_state = TCPS_FIN_WAIT_2;
938: }
939: break;
940:
941: /*
942: * In CLOSING STATE in addition to the processing for
943: * the ESTABLISHED state if the ACK acknowledges our FIN
944: * then enter the TIME-WAIT state, otherwise ignore
945: * the segment.
946: */
947: case TCPS_CLOSING:
948: if (ourfinisacked) {
949: tp->t_state = TCPS_TIME_WAIT;
950: tcp_canceltimers(tp);
951: tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
952: soisdisconnected(so);
953: }
954: break;
955:
956: /*
957: * In LAST_ACK, we may still be waiting for data to drain
958: * and/or to be acked, as well as for the ack of our FIN.
959: * If our FIN is now acknowledged, delete the TCB,
960: * enter the closed state and return.
961: */
962: case TCPS_LAST_ACK:
963: if (ourfinisacked) {
964: tp = tcp_close(tp);
965: goto drop;
966: }
967: break;
968:
969: /*
970: * In TIME_WAIT state the only thing that should arrive
971: * is a retransmission of the remote FIN. Acknowledge
972: * it and restart the finack timer.
973: */
974: case TCPS_TIME_WAIT:
975: tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
976: goto dropafterack;
977: }
978: }
979:
980: step6:
981: /*
982: * Update window information.
983: * Don't look at window if no ACK: TAC's send garbage on first SYN.
984: */
985: if ((tiflags & TH_ACK) &&
986: (SEQ_LT(tp->snd_wl1, ti->ti_seq) || tp->snd_wl1 == ti->ti_seq &&
987: (SEQ_LT(tp->snd_wl2, ti->ti_ack) ||
988: tp->snd_wl2 == ti->ti_ack && ti->ti_win > tp->snd_wnd))) {
989: /* keep track of pure window updates */
990: if (ti->ti_len == 0 &&
991: tp->snd_wl2 == ti->ti_ack && ti->ti_win > tp->snd_wnd)
992: tcpstat.tcps_rcvwinupd++;
993: tp->snd_wnd = ti->ti_win;
994: tp->snd_wl1 = ti->ti_seq;
995: tp->snd_wl2 = ti->ti_ack;
996: if (tp->snd_wnd > tp->max_sndwnd)
997: tp->max_sndwnd = tp->snd_wnd;
998: needoutput = 1;
999: }
1000:
1001: /*
1002: * Process segments with URG.
1003: */
1004: if ((tiflags & TH_URG) && ti->ti_urp &&
1005: TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1006: /*
1007: * This is a kludge, but if we receive and accept
1008: * random urgent pointers, we'll crash in
1009: * soreceive. It's hard to imagine someone
1010: * actually wanting to send this much urgent data.
1011: */
1012: if (ti->ti_urp + so->so_rcv.sb_cc > SB_MAX) {
1013: ti->ti_urp = 0; /* XXX */
1014: tiflags &= ~TH_URG; /* XXX */
1015: goto dodata; /* XXX */
1016: }
1017: /*
1018: * If this segment advances the known urgent pointer,
1019: * then mark the data stream. This should not happen
1020: * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
1021: * a FIN has been received from the remote side.
1022: * In these states we ignore the URG.
1023: *
1024: * According to RFC961 (Assigned Protocols),
1025: * the urgent pointer points to the last octet
1026: * of urgent data. We continue, however,
1027: * to consider it to indicate the first octet
1028: * of data past the urgent section as the original
1029: * spec states (in one of two places).
1030: */
1031: if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) {
1032: tp->rcv_up = ti->ti_seq + ti->ti_urp;
1033: so->so_oobmark = so->so_rcv.sb_cc +
1034: (tp->rcv_up - tp->rcv_nxt) - 1;
1035: if (so->so_oobmark == 0)
1036: so->so_state |= SS_RCVATMARK;
1037: sohasoutofband(so);
1038: tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
1039: }
1040: /*
1041: * Remove out of band data so doesn't get presented to user.
1042: * This can happen independent of advancing the URG pointer,
1043: * but if two URG's are pending at once, some out-of-band
1044: * data may creep in... ick.
1045: */
1046: if (ti->ti_urp <= ti->ti_len
1047: #ifdef SO_OOBINLINE
1048: && (so->so_options & SO_OOBINLINE) == 0
1049: #endif
1050: )
1051: tcp_pulloutofband(so, ti, m);
1052: } else
1053: /*
1054: * If no out of band data is expected,
1055: * pull receive urgent pointer along
1056: * with the receive window.
1057: */
1058: if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
1059: tp->rcv_up = tp->rcv_nxt;
1060: dodata: /* XXX */
1061:
1062: /*
1063: * Process the segment text, merging it into the TCP sequencing queue,
1064: * and arranging for acknowledgment of receipt if necessary.
1065: * This process logically involves adjusting tp->rcv_wnd as data
1066: * is presented to the user (this happens in tcp_usrreq.c,
1067: * case PRU_RCVD). If a FIN has already been received on this
1068: * connection then we just ignore the text.
1069: */
1070: if ((ti->ti_len || (tiflags&TH_FIN)) &&
1071: TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1072: TCP_REASS(tp, ti, m, so, tiflags);
1073: /*
1074: * Note the amount of data that peer has sent into
1075: * our window, in order to estimate the sender's
1076: * buffer size.
1077: */
1078: len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
1079: } else {
1080: m_freem(m);
1081: tiflags &= ~TH_FIN;
1082: }
1083:
1084: /*
1085: * If FIN is received ACK the FIN and let the user know
1086: * that the connection is closing.
1087: */
1088: if (tiflags & TH_FIN) {
1089: if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1090: socantrcvmore(so);
1091: tp->t_flags |= TF_ACKNOW;
1092: tp->rcv_nxt++;
1093: }
1094: switch (tp->t_state) {
1095:
1096: /*
1097: * In SYN_RECEIVED and ESTABLISHED STATES
1098: * enter the CLOSE_WAIT state.
1099: */
1100: case TCPS_SYN_RECEIVED:
1101: case TCPS_ESTABLISHED:
1102: tp->t_state = TCPS_CLOSE_WAIT;
1103: break;
1104:
1105: /*
1106: * If still in FIN_WAIT_1 STATE FIN has not been acked so
1107: * enter the CLOSING state.
1108: */
1109: case TCPS_FIN_WAIT_1:
1110: tp->t_state = TCPS_CLOSING;
1111: break;
1112:
1113: /*
1114: * In FIN_WAIT_2 state enter the TIME_WAIT state,
1115: * starting the time-wait timer, turning off the other
1116: * standard timers.
1117: */
1118: case TCPS_FIN_WAIT_2:
1119: tp->t_state = TCPS_TIME_WAIT;
1120: tcp_canceltimers(tp);
1121: tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1122: soisdisconnected(so);
1123: break;
1124:
1125: /*
1126: * In TIME_WAIT state restart the 2 MSL time_wait timer.
1127: */
1128: case TCPS_TIME_WAIT:
1129: tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1130: break;
1131: }
1132: }
1133: if (so->so_options & SO_DEBUG)
1134: tcp_trace(TA_INPUT, ostate, tp, &tcp_saveti, 0);
1135:
1136: /*
1137: * Return any desired output.
1138: */
1139: if (needoutput || (tp->t_flags & TF_ACKNOW))
1140: (void) tcp_output(tp);
1141: return;
1142:
1143: dropafterack:
1144: /*
1145: * Generate an ACK dropping incoming segment if it occupies
1146: * sequence space, where the ACK reflects our state.
1147: */
1148: if (tiflags & TH_RST)
1149: goto drop;
1150: m_freem(m);
1151: tp->t_flags |= TF_ACKNOW;
1152: (void) tcp_output(tp);
1153: return;
1154:
1155: dropwithreset:
1156: if (om) {
1157: (void) m_free(om);
1158: om = 0;
1159: }
1160: /*
1161: * Generate a RST, dropping incoming segment.
1162: * Make ACK acceptable to originator of segment.
1163: * Don't bother to respond if destination was broadcast.
1164: */
1165: if ((tiflags & TH_RST) || m->m_flags & M_BCAST)
1166: goto drop;
1167: if (tiflags & TH_ACK)
1168: tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST);
1169: else {
1170: if (tiflags & TH_SYN)
1171: ti->ti_len++;
1172: tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0,
1173: TH_RST|TH_ACK);
1174: }
1175: /* destroy temporarily created socket */
1176: if (dropsocket)
1177: (void) soabort(so);
1178: return;
1179:
1180: drop:
1181: if (om)
1182: (void) m_free(om);
1183: /*
1184: * Drop space held by incoming segment and return.
1185: */
1186: if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
1187: tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0);
1188: m_freem(m);
1189: /* destroy temporarily created socket */
1190: if (dropsocket)
1191: (void) soabort(so);
1192: return;
1193: }
1194:
1195: tcp_dooptions(tp, om, ti)
1196: struct tcpcb *tp;
1197: struct mbuf *om;
1198: struct tcpiphdr *ti;
1199: {
1200: register u_char *cp;
1201: u_short mss;
1202: int opt, optlen, cnt;
1203:
1204: cp = mtod(om, u_char *);
1205: cnt = om->m_len;
1206: for (; cnt > 0; cnt -= optlen, cp += optlen) {
1207: opt = cp[0];
1208: if (opt == TCPOPT_EOL)
1209: break;
1210: if (opt == TCPOPT_NOP)
1211: optlen = 1;
1212: else {
1213: optlen = cp[1];
1214: if (optlen <= 0)
1215: break;
1216: }
1217: switch (opt) {
1218:
1219: default:
1220: continue;
1221:
1222: case TCPOPT_MAXSEG:
1223: if (optlen != 4)
1224: continue;
1225: if (!(ti->ti_flags & TH_SYN))
1226: continue;
1227: bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
1228: NTOHS(mss);
1229: (void) tcp_mss(tp, mss); /* sets t_maxseg */
1230: break;
1231: }
1232: }
1233: (void) m_free(om);
1234: }
1235:
1236: /*
1237: * Pull out of band byte out of a segment so
1238: * it doesn't appear in the user's data queue.
1239: * It is still reflected in the segment length for
1240: * sequencing purposes.
1241: */
1242: tcp_pulloutofband(so, ti, m)
1243: struct socket *so;
1244: struct tcpiphdr *ti;
1245: register struct mbuf *m;
1246: {
1247: int cnt = ti->ti_urp - 1;
1248:
1249: while (cnt >= 0) {
1250: if (m->m_len > cnt) {
1251: char *cp = mtod(m, caddr_t) + cnt;
1252: struct tcpcb *tp = sototcpcb(so);
1253:
1254: tp->t_iobc = *cp;
1255: tp->t_oobflags |= TCPOOB_HAVEDATA;
1256: bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
1257: m->m_len--;
1258: return;
1259: }
1260: cnt -= m->m_len;
1261: m = m->m_next;
1262: if (m == 0)
1263: break;
1264: }
1265: panic("tcp_pulloutofband");
1266: }
1267:
1268: /*
1269: * Collect new round-trip time estimate
1270: * and update averages and current timeout.
1271: */
1272: tcp_xmit_timer(tp)
1273: register struct tcpcb *tp;
1274: {
1275: register short delta;
1276:
1277: tcpstat.tcps_rttupdated++;
1278: if (tp->t_srtt != 0) {
1279: /*
1280: * srtt is stored as fixed point with 3 bits after the
1281: * binary point (i.e., scaled by 8). The following magic
1282: * is equivalent to the smoothing algorithm in rfc793 with
1283: * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
1284: * point). Adjust t_rtt to origin 0.
1285: */
1286: delta = tp->t_rtt - 1 - (tp->t_srtt >> TCP_RTT_SHIFT);
1287: if ((tp->t_srtt += delta) <= 0)
1288: tp->t_srtt = 1;
1289: /*
1290: * We accumulate a smoothed rtt variance (actually, a
1291: * smoothed mean difference), then set the retransmit
1292: * timer to smoothed rtt + 4 times the smoothed variance.
1293: * rttvar is stored as fixed point with 2 bits after the
1294: * binary point (scaled by 4). The following is
1295: * equivalent to rfc793 smoothing with an alpha of .75
1296: * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
1297: * rfc793's wired-in beta.
1298: */
1299: if (delta < 0)
1300: delta = -delta;
1301: delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
1302: if ((tp->t_rttvar += delta) <= 0)
1303: tp->t_rttvar = 1;
1304: } else {
1305: /*
1306: * No rtt measurement yet - use the unsmoothed rtt.
1307: * Set the variance to half the rtt (so our first
1308: * retransmit happens at 2*rtt)
1309: */
1310: tp->t_srtt = tp->t_rtt << TCP_RTT_SHIFT;
1311: tp->t_rttvar = tp->t_rtt << (TCP_RTTVAR_SHIFT - 1);
1312: }
1313: tp->t_rtt = 0;
1314: tp->t_rxtshift = 0;
1315:
1316: /*
1317: * the retransmit should happen at rtt + 4 * rttvar.
1318: * Because of the way we do the smoothing, srtt and rttvar
1319: * will each average +1/2 tick of bias. When we compute
1320: * the retransmit timer, we want 1/2 tick of rounding and
1321: * 1 extra tick because of +-1/2 tick uncertainty in the
1322: * firing of the timer. The bias will give us exactly the
1323: * 1.5 tick we need. But, because the bias is
1324: * statistical, we have to test that we don't drop below
1325: * the minimum feasible timer (which is 2 ticks).
1326: */
1327: TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
1328: tp->t_rttmin, TCPTV_REXMTMAX);
1329:
1330: /*
1331: * We received an ack for a packet that wasn't retransmitted;
1332: * it is probably safe to discard any error indications we've
1333: * received recently. This isn't quite right, but close enough
1334: * for now (a route might have failed after we sent a segment,
1335: * and the return path might not be symmetrical).
1336: */
1337: tp->t_softerror = 0;
1338: }
1339:
1340: /*
1341: * Determine a reasonable value for maxseg size.
1342: * If the route is known, check route for mtu.
1343: * If none, use an mss that can be handled on the outgoing
1344: * interface without forcing IP to fragment; if bigger than
1345: * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
1346: * to utilize large mbufs. If no route is found, route has no mtu,
1347: * or the destination isn't local, use a default, hopefully conservative
1348: * size (usually 512 or the default IP max size, but no more than the mtu
1349: * of the interface), as we can't discover anything about intervening
1350: * gateways or networks. We also initialize the congestion/slow start
1351: * window to be a single segment if the destination isn't local.
1352: * While looking at the routing entry, we also initialize other path-dependent
1353: * parameters from pre-set or cached values in the routing entry.
1354: */
1355:
1356: tcp_mss(tp, offer)
1357: register struct tcpcb *tp;
1358: u_short offer;
1359: {
1360: struct route *ro;
1361: register struct rtentry *rt;
1362: struct ifnet *ifp;
1363: register int rtt, mss;
1364: u_long bufsize;
1365: struct inpcb *inp;
1366: struct socket *so;
1367: extern int tcp_mssdflt, tcp_rttdflt;
1368:
1369: inp = tp->t_inpcb;
1370: ro = &inp->inp_route;
1371:
1372: if ((rt = ro->ro_rt) == (struct rtentry *)0) {
1373: /* No route yet, so try to acquire one */
1374: if (inp->inp_faddr.s_addr != INADDR_ANY) {
1375: ro->ro_dst.sa_family = AF_INET;
1376: ro->ro_dst.sa_len = sizeof(ro->ro_dst);
1377: ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
1378: inp->inp_faddr;
1379: rtalloc(ro);
1380: }
1381: if ((rt = ro->ro_rt) == (struct rtentry *)0)
1382: return (tcp_mssdflt);
1383: }
1384: ifp = rt->rt_ifp;
1385: so = inp->inp_socket;
1386:
1387: #ifdef RTV_MTU /* if route characteristics exist ... */
1388: /*
1389: * While we're here, check if there's an initial rtt
1390: * or rttvar. Convert from the route-table units
1391: * to scaled multiples of the slow timeout timer.
1392: */
1393: if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
1394: if (rt->rt_rmx.rmx_locks & RTV_MTU)
1395: tp->t_rttmin = rtt / (RTM_RTTUNIT / PR_SLOWHZ);
1396: tp->t_srtt = rtt / (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTT_SCALE));
1397: if (rt->rt_rmx.rmx_rttvar)
1398: tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
1399: (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTTVAR_SCALE));
1400: else
1401: /* default variation is +- 1 rtt */
1402: tp->t_rttvar =
1403: tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
1404: TCPT_RANGESET(tp->t_rxtcur,
1405: ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
1406: tp->t_rttmin, TCPTV_REXMTMAX);
1407: }
1408: /*
1409: * if there's an mtu associated with the route, use it
1410: */
1411: if (rt->rt_rmx.rmx_mtu)
1412: mss = rt->rt_rmx.rmx_mtu - sizeof(struct tcpiphdr);
1413: else
1414: #endif /* RTV_MTU */
1415: {
1416: mss = ifp->if_mtu - sizeof(struct tcpiphdr);
1417: #if (MCLBYTES & (MCLBYTES - 1)) == 0
1418: if (mss > MCLBYTES)
1419: mss &= ~(MCLBYTES-1);
1420: #else
1421: if (mss > MCLBYTES)
1422: mss = mss / MCLBYTES * MCLBYTES;
1423: #endif
1424: if (!in_localaddr(inp->inp_faddr))
1425: mss = min(mss, tcp_mssdflt);
1426: }
1427: /*
1428: * The current mss, t_maxseg, is initialized to the default value.
1429: * If we compute a smaller value, reduce the current mss.
1430: * If we compute a larger value, return it for use in sending
1431: * a max seg size option, but don't store it for use
1432: * unless we received an offer at least that large from peer.
1433: * However, do not accept offers under 32 bytes.
1434: */
1435: if (offer)
1436: mss = min(mss, offer);
1437: mss = max(mss, 32); /* sanity */
1438: if (mss < tp->t_maxseg || offer != 0) {
1439: /*
1440: * If there's a pipesize, change the socket buffer
1441: * to that size. Make the socket buffers an integral
1442: * number of mss units; if the mss is larger than
1443: * the socket buffer, decrease the mss.
1444: */
1445: #ifdef RTV_SPIPE
1446: if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
1447: #endif
1448: bufsize = so->so_snd.sb_hiwat;
1449: if (bufsize < mss)
1450: mss = bufsize;
1451: else {
1452: bufsize = min(bufsize, SB_MAX) / mss * mss;
1453: (void) sbreserve(&so->so_snd, bufsize);
1454: }
1455: tp->t_maxseg = mss;
1456:
1457: #ifdef RTV_RPIPE
1458: if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
1459: #endif
1460: bufsize = so->so_rcv.sb_hiwat;
1461: if (bufsize > mss) {
1462: bufsize = min(bufsize, SB_MAX) / mss * mss;
1463: (void) sbreserve(&so->so_rcv, bufsize);
1464: }
1465: }
1466: tp->snd_cwnd = mss;
1467:
1468: #ifdef RTV_SSTHRESH
1469: if (rt->rt_rmx.rmx_ssthresh) {
1470: /*
1471: * There's some sort of gateway or interface
1472: * buffer limit on the path. Use this to set
1473: * the slow start threshhold, but set the
1474: * threshold to no less than 2*mss.
1475: */
1476: tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
1477: }
1478: #endif /* RTV_MTU */
1479: return (mss);
1480: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.