Annotation of sys/netinet/tcp_input.c, Revision 1.1.1.1
1.1 nbrk 1: /* $OpenBSD: tcp_input.c,v 1.207 2007/06/15 18:23:06 markus Exp $ */
2: /* $NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $ */
3:
4: /*
5: * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
6: * The Regents of the University of California. All rights reserved.
7: *
8: * Redistribution and use in source and binary forms, with or without
9: * modification, are permitted provided that the following conditions
10: * are met:
11: * 1. Redistributions of source code must retain the above copyright
12: * notice, this list of conditions and the following disclaimer.
13: * 2. Redistributions in binary form must reproduce the above copyright
14: * notice, this list of conditions and the following disclaimer in the
15: * documentation and/or other materials provided with the distribution.
16: * 3. Neither the name of the University nor the names of its contributors
17: * may be used to endorse or promote products derived from this software
18: * without specific prior written permission.
19: *
20: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30: * SUCH DAMAGE.
31: *
32: * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
33: *
34: * NRL grants permission for redistribution and use in source and binary
35: * forms, with or without modification, of the software and documentation
36: * created at NRL provided that the following conditions are met:
37: *
38: * 1. Redistributions of source code must retain the above copyright
39: * notice, this list of conditions and the following disclaimer.
40: * 2. Redistributions in binary form must reproduce the above copyright
41: * notice, this list of conditions and the following disclaimer in the
42: * documentation and/or other materials provided with the distribution.
43: * 3. All advertising materials mentioning features or use of this software
44: * must display the following acknowledgements:
45: * This product includes software developed by the University of
46: * California, Berkeley and its contributors.
47: * This product includes software developed at the Information
48: * Technology Division, US Naval Research Laboratory.
49: * 4. Neither the name of the NRL nor the names of its contributors
50: * may be used to endorse or promote products derived from this software
51: * without specific prior written permission.
52: *
53: * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
54: * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
55: * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
56: * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
57: * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
58: * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
59: * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
60: * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
61: * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
62: * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
63: * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
64: *
65: * The views and conclusions contained in the software and documentation
66: * are those of the authors and should not be interpreted as representing
67: * official policies, either expressed or implied, of the US Naval
68: * Research Laboratory (NRL).
69: */
70:
71: #include <sys/param.h>
72: #include <sys/systm.h>
73: #include <sys/mbuf.h>
74: #include <sys/protosw.h>
75: #include <sys/socket.h>
76: #include <sys/socketvar.h>
77: #include <sys/kernel.h>
78:
79: #include <dev/rndvar.h>
80:
81: #include <net/if.h>
82: #include <net/route.h>
83:
84: #include <netinet/in.h>
85: #include <netinet/in_systm.h>
86: #include <netinet/ip.h>
87: #include <netinet/in_pcb.h>
88: #include <netinet/ip_var.h>
89: #include <netinet/tcp.h>
90: #include <netinet/tcp_fsm.h>
91: #include <netinet/tcp_seq.h>
92: #include <netinet/tcp_timer.h>
93: #include <netinet/tcp_var.h>
94: #include <netinet/tcpip.h>
95: #include <netinet/tcp_debug.h>
96:
/* Copy of the most recent IPv4 TCP/IP header, kept for SO_DEBUG tracing. */
struct tcpiphdr tcp_saveti;

int tcp_mss_adv(struct ifnet *, int);

#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

/* Copy of the most recent IPv6 TCP/IP header, kept for SO_DEBUG tracing. */
struct tcpipv6hdr tcp_saveti6;

/* for the packet header length in the mbuf */
#define M_PH_LEN(m)      (((struct mbuf *)(m))->m_pkthdr.len)
#define M_V6_LEN(m)      (M_PH_LEN(m) - sizeof(struct ip6_hdr))
#define M_V4_LEN(m)      (M_PH_LEN(m) - sizeof(struct ip))
#endif /* INET6 */

/* Duplicate ACKs required before entering fast retransmit. */
int	tcprexmtthresh = 3;
int	tcptv_keep_init = TCPTV_KEEP_INIT;

extern u_long sb_max;

/* Rate-limit state for responding with RST segments. */
int tcp_rst_ppslim = 100;		/* 100pps */
int tcp_rst_ppslim_count = 0;
struct timeval tcp_rst_ppslim_last;

/* Rate-limit state for dropping ACKs (stealth ACK-storm damping). */
int tcp_ackdrop_ppslim = 100;		/* 100pps */
int tcp_ackdrop_ppslim_count = 0;
struct timeval tcp_ackdrop_ppslim_last;

/* PAWS idle threshold: 24 days in slow-timeout ticks. */
#define TCP_PAWS_IDLE	(24 * 24 * 60 * 60 * PR_SLOWHZ)

/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

/* for TCP SACK comparisons */
#define SEQ_MIN(a,b)	(SEQ_LT(a,b) ? (a) : (b))
#define SEQ_MAX(a,b)	(SEQ_GT(a,b) ? (a) : (b))

/*
 * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
 */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
	if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
	    tp->t_inpcb->inp_route6.ro_rt) { \
		nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt, NULL, 0); \
	} \
} while (0)
#else
#define ND6_HINT(tp)
#endif

#ifdef TCP_ECN
/*
 * ECN (Explicit Congestion Notification) support based on RFC3168
 * implementation note:
 *   snd_last is used to track a recovery phase.
 *   when cwnd is reduced, snd_last is set to snd_max.
 *   while snd_last > snd_una, the sender is in a recovery phase and
 *   its cwnd should not be reduced again.
 *   snd_last follows snd_una when not in a recovery phase.
 */
#endif

/*
 * Macro to compute ACK transmission behavior.  Delay the ACK unless
 * we have already delayed an ACK (must send an ACK every two segments).
 * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
 * option is enabled.
 */
#define TCP_SETUP_ACK(tp, tiflags) \
do { \
	if ((tp)->t_flags & TF_DELACK || \
	    (tcp_ack_on_push && (tiflags) & TH_PUSH)) \
		tp->t_flags |= TF_ACKNOW; \
	else \
		TCP_SET_DELACK(tp); \
} while (0)
177:
178: /*
179: * Insert segment ti into reassembly queue of tcp with
180: * control block tp. Return TH_FIN if reassembly now includes
181: * a segment with FIN. The macro form does the common case inline
182: * (segment is the next to be received on an established connection,
183: * and the queue is empty), avoiding linkage into and removal
184: * from the queue and repetition of various conversions.
185: * Set DELACK for segments received in order, but ack immediately
186: * when segments are out of order (so fast retransmit can work).
187: */
188:
int
tcp_reass(tp, th, m, tlen)
	struct tcpcb *tp;	/* connection whose queue is being reassembled */
	struct tcphdr *th;	/* host-order TCP header of segment, or NULL */
	struct mbuf *m;		/* segment data; ownership is taken here */
	int *tlen;		/* in/out: segment data length, trimmed here */
{
	struct tcpqent *p, *q, *nq, *tiqe;
	struct socket *so = tp->t_inpcb->inp_socket;
	int flags;

	/*
	 * Call with th==0 after become established to
	 * force pre-ESTABLISHED data up to user socket.
	 */
	if (th == 0)
		goto present;

	/*
	 * Allocate a new queue entry, before we throw away any data.
	 * If we can't, just drop the packet.  XXX
	 */
	tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
	if (tiqe == NULL) {
		/*
		 * Pool exhausted: if the new segment is the very next one
		 * expected (fills the head hole), cannibalize the last
		 * queued entry for it; otherwise give up on the whole
		 * queue for this connection.
		 */
		tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
		if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
			/* Reuse last entry since new segment fills a hole */
			m_freem(tiqe->tcpqe_m);
			TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
		}
		if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
			/* Flush segment queue for this connection */
			tcp_freeq(tp);
			tcpstat.tcps_rcvmemdrop++;
			m_freem(m);
			return (0);
		}
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, tcpqe_q))
		if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		struct tcphdr *phdr = p->tcpqe_tcp;
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
		if (i > 0) {
			if (i >= *tlen) {
				/* entirely duplicate data; count and drop */
				tcpstat.tcps_rcvduppack++;
				tcpstat.tcps_rcvdupbyte += *tlen;
				m_freem(m);
				pool_put(&tcpqe_pool, tiqe);
				return (0);
			}
			/* partial overlap: trim the duplicated prefix */
			m_adj(m, i);
			*tlen -= i;
			th->th_seq += i;
		}
	}
	tcpstat.tcps_rcvoopack++;
	tcpstat.tcps_rcvoobyte += *tlen;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL; q = nq) {
		struct tcphdr *qhdr = q->tcpqe_tcp;
		int i = (th->th_seq + *tlen) - qhdr->th_seq;

		if (i <= 0)
			break;
		if (i < qhdr->th_reseqlen) {
			/* partial overlap: shave our data off its front */
			qhdr->th_seq += i;
			qhdr->th_reseqlen -= i;
			m_adj(q->tcpqe_m, i);
			break;
		}
		/* fully covered: unlink and free the old entry */
		nq = TAILQ_NEXT(q, tcpqe_q);
		m_freem(q->tcpqe_m);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		pool_put(&tcpqe_pool, q);
	}

	/* Insert the new segment queue entry into place. */
	tiqe->tcpqe_m = m;
	th->th_reseqlen = *tlen;
	tiqe->tcpqe_tcp = th;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
	}

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
		return (0);
	/* in SYN_RECEIVED, don't hand data up before handshake completes */
	if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
		return (0);
	do {
		tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
		/* remember FIN of the last contiguous segment delivered */
		flags = q->tcpqe_tcp->th_flags & TH_FIN;

		nq = TAILQ_NEXT(q, tcpqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
		ND6_HINT(tp);
		if (so->so_state & SS_CANTRCVMORE)
			m_freem(q->tcpqe_m);
		else
			sbappendstream(&so->so_rcv, q->tcpqe_m);
		pool_put(&tcpqe_pool, q);
		q = nq;
	} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
	sorwakeup(so);
	return (flags);
}
324:
325: #ifdef INET6
326: int
327: tcp6_input(mp, offp, proto)
328: struct mbuf **mp;
329: int *offp, proto;
330: {
331: struct mbuf *m = *mp;
332:
333: #if defined(NFAITH) && 0 < NFAITH
334: if (m->m_pkthdr.rcvif) {
335: if (m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
336: /* XXX send icmp6 host/port unreach? */
337: m_freem(m);
338: return IPPROTO_DONE;
339: }
340: }
341: #endif
342:
343: /*
344: * draft-itojun-ipv6-tcp-to-anycast
345: * better place to put this in?
346: */
347: if (m->m_flags & M_ANYCAST6) {
348: if (m->m_len >= sizeof(struct ip6_hdr)) {
349: struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
350: icmp6_error(m, ICMP6_DST_UNREACH,
351: ICMP6_DST_UNREACH_ADDR,
352: (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
353: } else
354: m_freem(m);
355: return IPPROTO_DONE;
356: }
357:
358: tcp_input(m, *offp, proto);
359: return IPPROTO_DONE;
360: }
361: #endif
362:
363: /*
364: * TCP input routine, follows pages 65-76 of the
365: * protocol specification dated September, 1981 very closely.
366: */
367: void
368: tcp_input(struct mbuf *m, ...)
369: {
370: struct ip *ip;
371: struct inpcb *inp;
372: u_int8_t *optp = NULL;
373: int optlen = 0;
374: int tlen, off;
375: struct tcpcb *tp = 0;
376: int tiflags;
377: struct socket *so = NULL;
378: int todrop, acked, ourfinisacked, needoutput = 0;
379: int hdroptlen = 0;
380: short ostate = 0;
381: tcp_seq iss, *reuse = NULL;
382: u_long tiwin;
383: struct tcp_opt_info opti;
384: int iphlen;
385: va_list ap;
386: struct tcphdr *th;
387: #ifdef INET6
388: struct ip6_hdr *ip6 = NULL;
389: #endif /* INET6 */
390: #ifdef IPSEC
391: struct m_tag *mtag;
392: struct tdb_ident *tdbi;
393: struct tdb *tdb;
394: int error, s;
395: #endif /* IPSEC */
396: int af;
397: #ifdef TCP_ECN
398: u_char iptos;
399: #endif
400:
401: va_start(ap, m);
402: iphlen = va_arg(ap, int);
403: va_end(ap);
404:
405: tcpstat.tcps_rcvtotal++;
406:
407: opti.ts_present = 0;
408: opti.maxseg = 0;
409:
410: /*
411: * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
412: * See below for AF specific multicast.
413: */
414: if (m->m_flags & (M_BCAST|M_MCAST))
415: goto drop;
416:
417: /*
418: * Before we do ANYTHING, we have to figure out if it's TCP/IPv6 or
419: * TCP/IPv4.
420: */
421: switch (mtod(m, struct ip *)->ip_v) {
422: #ifdef INET6
423: case 6:
424: af = AF_INET6;
425: break;
426: #endif
427: case 4:
428: af = AF_INET;
429: break;
430: default:
431: m_freem(m);
432: return; /*EAFNOSUPPORT*/
433: }
434:
435: /*
436: * Get IP and TCP header together in first mbuf.
437: * Note: IP leaves IP header in first mbuf.
438: */
439: switch (af) {
440: case AF_INET:
441: #ifdef DIAGNOSTIC
442: if (iphlen < sizeof(struct ip)) {
443: m_freem(m);
444: return;
445: }
446: #endif /* DIAGNOSTIC */
447: break;
448: #ifdef INET6
449: case AF_INET6:
450: #ifdef DIAGNOSTIC
451: if (iphlen < sizeof(struct ip6_hdr)) {
452: m_freem(m);
453: return;
454: }
455: #endif /* DIAGNOSTIC */
456: break;
457: #endif
458: default:
459: m_freem(m);
460: return;
461: }
462:
463: IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
464: if (!th) {
465: tcpstat.tcps_rcvshort++;
466: return;
467: }
468:
469: tlen = m->m_pkthdr.len - iphlen;
470: ip = NULL;
471: #ifdef INET6
472: ip6 = NULL;
473: #endif
474: switch (af) {
475: case AF_INET:
476: ip = mtod(m, struct ip *);
477: if (IN_MULTICAST(ip->ip_dst.s_addr) ||
478: in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
479: goto drop;
480: #ifdef TCP_ECN
481: /* save ip_tos before clearing it for checksum */
482: iptos = ip->ip_tos;
483: #endif
484: /*
485: * Checksum extended TCP header and data.
486: */
487: if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
488: if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
489: tcpstat.tcps_inhwcsum++;
490: tcpstat.tcps_rcvbadsum++;
491: goto drop;
492: }
493: if (in4_cksum(m, IPPROTO_TCP, iphlen, tlen) != 0) {
494: tcpstat.tcps_rcvbadsum++;
495: goto drop;
496: }
497: } else {
498: m->m_pkthdr.csum_flags &= ~M_TCP_CSUM_IN_OK;
499: tcpstat.tcps_inhwcsum++;
500: }
501: break;
502: #ifdef INET6
503: case AF_INET6:
504: ip6 = mtod(m, struct ip6_hdr *);
505: #ifdef TCP_ECN
506: iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
507: #endif
508:
509: /* Be proactive about malicious use of IPv4 mapped address */
510: if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
511: IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
512: /* XXX stat */
513: goto drop;
514: }
515:
516: /*
517: * Be proactive about unspecified IPv6 address in source.
518: * As we use all-zero to indicate unbounded/unconnected pcb,
519: * unspecified IPv6 address can be used to confuse us.
520: *
521: * Note that packets with unspecified IPv6 destination is
522: * already dropped in ip6_input.
523: */
524: if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
525: /* XXX stat */
526: goto drop;
527: }
528:
529: /* Discard packets to multicast */
530: if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
531: /* XXX stat */
532: goto drop;
533: }
534:
535: /*
536: * Checksum extended TCP header and data.
537: */
538: if (in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen)) {
539: tcpstat.tcps_rcvbadsum++;
540: goto drop;
541: }
542: break;
543: #endif
544: }
545:
546: /*
547: * Check that TCP offset makes sense,
548: * pull out TCP options and adjust length. XXX
549: */
550: off = th->th_off << 2;
551: if (off < sizeof(struct tcphdr) || off > tlen) {
552: tcpstat.tcps_rcvbadoff++;
553: goto drop;
554: }
555: tlen -= off;
556: if (off > sizeof(struct tcphdr)) {
557: IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
558: if (!th) {
559: tcpstat.tcps_rcvshort++;
560: return;
561: }
562: optlen = off - sizeof(struct tcphdr);
563: optp = (u_int8_t *)(th + 1);
564: /*
565: * Do quick retrieval of timestamp options ("options
566: * prediction?"). If timestamp is the only option and it's
567: * formatted as recommended in RFC 1323 appendix A, we
568: * quickly get the values now and not bother calling
569: * tcp_dooptions(), etc.
570: */
571: if ((optlen == TCPOLEN_TSTAMP_APPA ||
572: (optlen > TCPOLEN_TSTAMP_APPA &&
573: optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
574: *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
575: (th->th_flags & TH_SYN) == 0) {
576: opti.ts_present = 1;
577: opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
578: opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
579: optp = NULL; /* we've parsed the options */
580: }
581: }
582: tiflags = th->th_flags;
583:
584: /*
585: * Convert TCP protocol specific fields to host format.
586: */
587: NTOHL(th->th_seq);
588: NTOHL(th->th_ack);
589: NTOHS(th->th_win);
590: NTOHS(th->th_urp);
591:
592: /*
593: * Locate pcb for segment.
594: */
595: findpcb:
596: switch (af) {
597: #ifdef INET6
598: case AF_INET6:
599: inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src, th->th_sport,
600: &ip6->ip6_dst, th->th_dport);
601: break;
602: #endif
603: case AF_INET:
604: inp = in_pcbhashlookup(&tcbtable, ip->ip_src, th->th_sport,
605: ip->ip_dst, th->th_dport);
606: break;
607: }
608: if (inp == 0) {
609: int inpl_flags = 0;
610: if (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST)
611: inpl_flags = INPLOOKUP_WILDCARD;
612: ++tcpstat.tcps_pcbhashmiss;
613: switch (af) {
614: #ifdef INET6
615: case AF_INET6:
616: inp = in6_pcblookup_listen(&tcbtable,
617: &ip6->ip6_dst, th->th_dport, inpl_flags);
618: break;
619: #endif /* INET6 */
620: case AF_INET:
621: inp = in_pcblookup_listen(&tcbtable,
622: ip->ip_dst, th->th_dport, inpl_flags);
623: break;
624: }
625: /*
626: * If the state is CLOSED (i.e., TCB does not exist) then
627: * all data in the incoming segment is discarded.
628: * If the TCB exists but is in CLOSED state, it is embryonic,
629: * but should either do a listen or a connect soon.
630: */
631: if (inp == 0) {
632: ++tcpstat.tcps_noport;
633: goto dropwithreset_ratelim;
634: }
635: }
636:
637: /* Check the minimum TTL for socket. */
638: if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
639: goto drop;
640:
641: tp = intotcpcb(inp);
642: if (tp == 0)
643: goto dropwithreset_ratelim;
644: if (tp->t_state == TCPS_CLOSED)
645: goto drop;
646:
647: /* Unscale the window into a 32-bit value. */
648: if ((tiflags & TH_SYN) == 0)
649: tiwin = th->th_win << tp->snd_scale;
650: else
651: tiwin = th->th_win;
652:
653: so = inp->inp_socket;
654: if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
655: union syn_cache_sa src;
656: union syn_cache_sa dst;
657:
658: bzero(&src, sizeof(src));
659: bzero(&dst, sizeof(dst));
660: switch (af) {
661: #ifdef INET
662: case AF_INET:
663: src.sin.sin_len = sizeof(struct sockaddr_in);
664: src.sin.sin_family = AF_INET;
665: src.sin.sin_addr = ip->ip_src;
666: src.sin.sin_port = th->th_sport;
667:
668: dst.sin.sin_len = sizeof(struct sockaddr_in);
669: dst.sin.sin_family = AF_INET;
670: dst.sin.sin_addr = ip->ip_dst;
671: dst.sin.sin_port = th->th_dport;
672: break;
673: #endif
674: #ifdef INET6
675: case AF_INET6:
676: src.sin6.sin6_len = sizeof(struct sockaddr_in6);
677: src.sin6.sin6_family = AF_INET6;
678: src.sin6.sin6_addr = ip6->ip6_src;
679: src.sin6.sin6_port = th->th_sport;
680:
681: dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
682: dst.sin6.sin6_family = AF_INET6;
683: dst.sin6.sin6_addr = ip6->ip6_dst;
684: dst.sin6.sin6_port = th->th_dport;
685: break;
686: #endif /* INET6 */
687: default:
688: goto badsyn; /*sanity*/
689: }
690:
691: if (so->so_options & SO_DEBUG) {
692: ostate = tp->t_state;
693: switch (af) {
694: #ifdef INET6
695: case AF_INET6:
696: bcopy(ip6, &tcp_saveti6.ti6_i, sizeof(*ip6));
697: bcopy(th, &tcp_saveti6.ti6_t, sizeof(*th));
698: break;
699: #endif
700: case AF_INET:
701: bcopy(ip, &tcp_saveti.ti_i, sizeof(*ip));
702: bcopy(th, &tcp_saveti.ti_t, sizeof(*th));
703: break;
704: }
705: }
706: if (so->so_options & SO_ACCEPTCONN) {
707: if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
708: if (tiflags & TH_RST) {
709: syn_cache_reset(&src.sa, &dst.sa, th);
710: } else if ((tiflags & (TH_ACK|TH_SYN)) ==
711: (TH_ACK|TH_SYN)) {
712: /*
713: * Received a SYN,ACK. This should
714: * never happen while we are in
715: * LISTEN. Send an RST.
716: */
717: goto badsyn;
718: } else if (tiflags & TH_ACK) {
719: so = syn_cache_get(&src.sa, &dst.sa,
720: th, iphlen, tlen, so, m);
721: if (so == NULL) {
722: /*
723: * We don't have a SYN for
724: * this ACK; send an RST.
725: */
726: goto badsyn;
727: } else if (so ==
728: (struct socket *)(-1)) {
729: /*
730: * We were unable to create
731: * the connection. If the
732: * 3-way handshake was
733: * completed, and RST has
734: * been sent to the peer.
735: * Since the mbuf might be
736: * in use for the reply,
737: * do not free it.
738: */
739: m = NULL;
740: } else {
741: /*
742: * We have created a
743: * full-blown connection.
744: */
745: tp = NULL;
746: inp = (struct inpcb *)so->so_pcb;
747: tp = intotcpcb(inp);
748: if (tp == NULL)
749: goto badsyn; /*XXX*/
750:
751: /*
752: * Compute proper scaling
753: * value from buffer space
754: */
755: tcp_rscale(tp, so->so_rcv.sb_hiwat);
756: goto after_listen;
757: }
758: } else {
759: /*
760: * None of RST, SYN or ACK was set.
761: * This is an invalid packet for a
762: * TCB in LISTEN state. Send a RST.
763: */
764: goto badsyn;
765: }
766: } else {
767: /*
768: * Received a SYN.
769: */
770: #ifdef INET6
771: /*
772: * If deprecated address is forbidden, we do
773: * not accept SYN to deprecated interface
774: * address to prevent any new inbound
775: * connection from getting established.
776: * When we do not accept SYN, we send a TCP
777: * RST, with deprecated source address (instead
778: * of dropping it). We compromise it as it is
779: * much better for peer to send a RST, and
780: * RST will be the final packet for the
781: * exchange.
782: *
783: * If we do not forbid deprecated addresses, we
784: * accept the SYN packet. RFC2462 does not
785: * suggest dropping SYN in this case.
786: * If we decipher RFC2462 5.5.4, it says like
787: * this:
788: * 1. use of deprecated addr with existing
789: * communication is okay - "SHOULD continue
790: * to be used"
791: * 2. use of it with new communication:
792: * (2a) "SHOULD NOT be used if alternate
793: * address with sufficient scope is
794: * available"
795: * (2b) nothing mentioned otherwise.
796: * Here we fall into (2b) case as we have no
797: * choice in our source address selection - we
798: * must obey the peer.
799: *
800: * The wording in RFC2462 is confusing, and
801: * there are multiple description text for
802: * deprecated address handling - worse, they
803: * are not exactly the same. I believe 5.5.4
804: * is the best one, so we follow 5.5.4.
805: */
806: if (ip6 && !ip6_use_deprecated) {
807: struct in6_ifaddr *ia6;
808:
809: if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif,
810: &ip6->ip6_dst)) &&
811: (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
812: tp = NULL;
813: goto dropwithreset;
814: }
815: }
816: #endif
817:
818: /*
819: * LISTEN socket received a SYN
820: * from itself? This can't possibly
821: * be valid; drop the packet.
822: */
823: if (th->th_dport == th->th_sport) {
824: switch (af) {
825: #ifdef INET6
826: case AF_INET6:
827: if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
828: &ip6->ip6_dst)) {
829: tcpstat.tcps_badsyn++;
830: goto drop;
831: }
832: break;
833: #endif /* INET6 */
834: case AF_INET:
835: if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
836: tcpstat.tcps_badsyn++;
837: goto drop;
838: }
839: break;
840: }
841: }
842:
843: /*
844: * SYN looks ok; create compressed TCP
845: * state for it.
846: */
847: if (so->so_qlen <= so->so_qlimit &&
848: syn_cache_add(&src.sa, &dst.sa, th, iphlen,
849: so, m, optp, optlen, &opti, reuse))
850: m = NULL;
851: }
852: goto drop;
853: }
854: }
855:
856: after_listen:
857: #ifdef DIAGNOSTIC
858: /*
859: * Should not happen now that all embryonic connections
860: * are handled with compressed state.
861: */
862: if (tp->t_state == TCPS_LISTEN)
863: panic("tcp_input: TCPS_LISTEN");
864: #endif
865:
866: #ifdef IPSEC
867: /* Find most recent IPsec tag */
868: mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
869: s = splnet();
870: if (mtag != NULL) {
871: tdbi = (struct tdb_ident *)(mtag + 1);
872: tdb = gettdb(tdbi->spi, &tdbi->dst, tdbi->proto);
873: } else
874: tdb = NULL;
875: ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
876: tdb, inp);
877: if (error) {
878: splx(s);
879: goto drop;
880: }
881:
882: /* Latch SA */
883: if (inp->inp_tdb_in != tdb) {
884: if (tdb) {
885: tdb_add_inp(tdb, inp, 1);
886: if (inp->inp_ipo == NULL) {
887: inp->inp_ipo = ipsec_add_policy(inp, af,
888: IPSP_DIRECTION_OUT);
889: if (inp->inp_ipo == NULL) {
890: splx(s);
891: goto drop;
892: }
893: }
894: if (inp->inp_ipo->ipo_dstid == NULL &&
895: tdb->tdb_srcid != NULL) {
896: inp->inp_ipo->ipo_dstid = tdb->tdb_srcid;
897: tdb->tdb_srcid->ref_count++;
898: }
899: if (inp->inp_ipsec_remotecred == NULL &&
900: tdb->tdb_remote_cred != NULL) {
901: inp->inp_ipsec_remotecred =
902: tdb->tdb_remote_cred;
903: tdb->tdb_remote_cred->ref_count++;
904: }
905: if (inp->inp_ipsec_remoteauth == NULL &&
906: tdb->tdb_remote_auth != NULL) {
907: inp->inp_ipsec_remoteauth =
908: tdb->tdb_remote_auth;
909: tdb->tdb_remote_auth->ref_count++;
910: }
911: } else { /* Just reset */
912: TAILQ_REMOVE(&inp->inp_tdb_in->tdb_inp_in, inp,
913: inp_tdb_in_next);
914: inp->inp_tdb_in = NULL;
915: }
916: }
917: splx(s);
918: #endif /* IPSEC */
919:
920: /*
921: * Segment received on connection.
922: * Reset idle time and keep-alive timer.
923: */
924: tp->t_rcvtime = tcp_now;
925: if (TCPS_HAVEESTABLISHED(tp->t_state))
926: TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
927:
928: #ifdef TCP_SACK
929: if (tp->sack_enable)
930: tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
931: #endif /* TCP_SACK */
932:
933: /*
934: * Process options.
935: */
936: #ifdef TCP_SIGNATURE
937: if (optp || (tp->t_flags & TF_SIGNATURE))
938: #else
939: if (optp)
940: #endif
941: if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti))
942: goto drop;
943:
944: if (opti.ts_present && opti.ts_ecr) {
945: int rtt_test;
946:
947: /* subtract out the tcp timestamp modulator */
948: opti.ts_ecr -= tp->ts_modulate;
949:
950: /* make sure ts_ecr is sensible */
951: rtt_test = tcp_now - opti.ts_ecr;
952: if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
953: opti.ts_ecr = 0;
954: }
955:
956: #ifdef TCP_ECN
957: /* if congestion experienced, set ECE bit in subsequent packets. */
958: if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
959: tp->t_flags |= TF_RCVD_CE;
960: tcpstat.tcps_ecn_rcvce++;
961: }
962: #endif
963: /*
964: * Header prediction: check for the two common cases
965: * of a uni-directional data xfer. If the packet has
966: * no control flags, is in-sequence, the window didn't
967: * change and we're not retransmitting, it's a
968: * candidate. If the length is zero and the ack moved
969: * forward, we're the sender side of the xfer. Just
970: * free the data acked & wake any higher level process
971: * that was blocked waiting for space. If the length
972: * is non-zero and the ack didn't move, we're the
973: * receiver side. If we're getting packets in-order
974: * (the reassembly queue is empty), add the data to
975: * the socket buffer and note that we need a delayed ack.
976: */
977: if (tp->t_state == TCPS_ESTABLISHED &&
978: #ifdef TCP_ECN
979: (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
980: #else
981: (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
982: #endif
983: (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
984: th->th_seq == tp->rcv_nxt &&
985: tiwin && tiwin == tp->snd_wnd &&
986: tp->snd_nxt == tp->snd_max) {
987:
988: /*
989: * If last ACK falls within this segment's sequence numbers,
990: * record the timestamp.
991: * Fix from Braden, see Stevens p. 870
992: */
993: if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
994: tp->ts_recent_age = tcp_now;
995: tp->ts_recent = opti.ts_val;
996: }
997:
998: if (tlen == 0) {
999: if (SEQ_GT(th->th_ack, tp->snd_una) &&
1000: SEQ_LEQ(th->th_ack, tp->snd_max) &&
1001: tp->snd_cwnd >= tp->snd_wnd &&
1002: tp->t_dupacks == 0) {
1003: /*
1004: * this is a pure ack for outstanding data.
1005: */
1006: ++tcpstat.tcps_predack;
1007: if (opti.ts_present && opti.ts_ecr)
1008: tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
1009: else if (tp->t_rtttime &&
1010: SEQ_GT(th->th_ack, tp->t_rtseq))
1011: tcp_xmit_timer(tp,
1012: tcp_now - tp->t_rtttime);
1013: acked = th->th_ack - tp->snd_una;
1014: tcpstat.tcps_rcvackpack++;
1015: tcpstat.tcps_rcvackbyte += acked;
1016: ND6_HINT(tp);
1017: sbdrop(&so->so_snd, acked);
1018:
1019: /*
1020: * If we had a pending ICMP message that
1021: * referres to data that have just been
1022: * acknowledged, disregard the recorded ICMP
1023: * message.
1024: */
1025: if ((tp->t_flags & TF_PMTUD_PEND) &&
1026: SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
1027: tp->t_flags &= ~TF_PMTUD_PEND;
1028:
1029: /*
1030: * Keep track of the largest chunk of data
1031: * acknowledged since last PMTU update
1032: */
1033: if (tp->t_pmtud_mss_acked < acked)
1034: tp->t_pmtud_mss_acked = acked;
1035:
1036: tp->snd_una = th->th_ack;
1037: #if defined(TCP_SACK) || defined(TCP_ECN)
1038: /*
1039: * We want snd_last to track snd_una so
1040: * as to avoid sequence wraparound problems
1041: * for very large transfers.
1042: */
1043: #ifdef TCP_ECN
1044: if (SEQ_GT(tp->snd_una, tp->snd_last))
1045: #endif
1046: tp->snd_last = tp->snd_una;
1047: #endif /* TCP_SACK */
1048: #if defined(TCP_SACK) && defined(TCP_FACK)
1049: tp->snd_fack = tp->snd_una;
1050: tp->retran_data = 0;
1051: #endif /* TCP_FACK */
1052: m_freem(m);
1053:
1054: /*
1055: * If all outstanding data are acked, stop
1056: * retransmit timer, otherwise restart timer
1057: * using current (possibly backed-off) value.
1058: * If process is waiting for space,
1059: * wakeup/selwakeup/signal. If data
1060: * are ready to send, let tcp_output
1061: * decide between more output or persist.
1062: */
1063: if (tp->snd_una == tp->snd_max)
1064: TCP_TIMER_DISARM(tp, TCPT_REXMT);
1065: else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
1066: TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
1067:
1068: if (sb_notify(&so->so_snd))
1069: sowwakeup(so);
1070: if (so->so_snd.sb_cc)
1071: (void) tcp_output(tp);
1072: return;
1073: }
1074: } else if (th->th_ack == tp->snd_una &&
1075: TAILQ_EMPTY(&tp->t_segq) &&
1076: tlen <= sbspace(&so->so_rcv)) {
1077: /*
1078: * This is a pure, in-sequence data packet
1079: * with nothing on the reassembly queue and
1080: * we have enough buffer space to take it.
1081: */
1082: #ifdef TCP_SACK
1083: /* Clean receiver SACK report if present */
1084: if (tp->sack_enable && tp->rcv_numsacks)
1085: tcp_clean_sackreport(tp);
1086: #endif /* TCP_SACK */
1087: ++tcpstat.tcps_preddat;
1088: tp->rcv_nxt += tlen;
1089: tcpstat.tcps_rcvpack++;
1090: tcpstat.tcps_rcvbyte += tlen;
1091: ND6_HINT(tp);
1092: /*
1093: * Drop TCP, IP headers and TCP options then add data
1094: * to socket buffer.
1095: */
1096: if (so->so_state & SS_CANTRCVMORE)
1097: m_freem(m);
1098: else {
1099: m_adj(m, iphlen + off);
1100: sbappendstream(&so->so_rcv, m);
1101: }
1102: sorwakeup(so);
1103: TCP_SETUP_ACK(tp, tiflags);
1104: if (tp->t_flags & TF_ACKNOW)
1105: (void) tcp_output(tp);
1106: return;
1107: }
1108: }
1109:
1110: /*
1111: * Compute mbuf offset to TCP data segment.
1112: */
1113: hdroptlen = iphlen + off;
1114:
1115: /*
1116: * Calculate amount of space in receive window,
1117: * and then do TCP input processing.
1118: * Receive window is amount of space in rcv queue,
1119: * but not less than advertised window.
1120: */
1121: { int win;
1122:
1123: win = sbspace(&so->so_rcv);
1124: if (win < 0)
1125: win = 0;
1126: tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
1127: }
1128:
1129: switch (tp->t_state) {
1130:
1131: /*
1132: * If the state is SYN_RECEIVED:
1133: * if seg contains SYN/ACK, send an RST.
1134: * if seg contains an ACK, but not for our SYN/ACK, send an RST
1135: */
1136:
1137: case TCPS_SYN_RECEIVED:
1138: if (tiflags & TH_ACK) {
1139: if (tiflags & TH_SYN) {
1140: tcpstat.tcps_badsyn++;
1141: goto dropwithreset;
1142: }
1143: if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
1144: SEQ_GT(th->th_ack, tp->snd_max))
1145: goto dropwithreset;
1146: }
1147: break;
1148:
1149: /*
1150: * If the state is SYN_SENT:
1151: * if seg contains an ACK, but not for our SYN, drop the input.
1152: * if seg contains a RST, then drop the connection.
1153: * if seg does not contain SYN, then drop it.
1154: * Otherwise this is an acceptable SYN segment
1155: * initialize tp->rcv_nxt and tp->irs
1156: * if seg contains ack then advance tp->snd_una
1157: * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
1158: * arrange for segment to be acked (eventually)
1159: * continue processing rest of data/controls, beginning with URG
1160: */
1161: case TCPS_SYN_SENT:
1162: if ((tiflags & TH_ACK) &&
1163: (SEQ_LEQ(th->th_ack, tp->iss) ||
1164: SEQ_GT(th->th_ack, tp->snd_max)))
1165: goto dropwithreset;
1166: if (tiflags & TH_RST) {
1167: #ifdef TCP_ECN
1168: /* if ECN is enabled, fall back to non-ecn at rexmit */
1169: if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
1170: goto drop;
1171: #endif
1172: if (tiflags & TH_ACK)
1173: tp = tcp_drop(tp, ECONNREFUSED);
1174: goto drop;
1175: }
1176: if ((tiflags & TH_SYN) == 0)
1177: goto drop;
1178: if (tiflags & TH_ACK) {
1179: tp->snd_una = th->th_ack;
1180: if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1181: tp->snd_nxt = tp->snd_una;
1182: }
1183: TCP_TIMER_DISARM(tp, TCPT_REXMT);
1184: tp->irs = th->th_seq;
1185: tcp_mss(tp, opti.maxseg);
1186: /* Reset initial window to 1 segment for retransmit */
1187: if (tp->t_rxtshift > 0)
1188: tp->snd_cwnd = tp->t_maxseg;
1189: tcp_rcvseqinit(tp);
1190: tp->t_flags |= TF_ACKNOW;
1191: #ifdef TCP_SACK
1192: /*
1193: * If we've sent a SACK_PERMITTED option, and the peer
1194: * also replied with one, then TF_SACK_PERMIT should have
1195: * been set in tcp_dooptions(). If it was not, disable SACKs.
1196: */
1197: if (tp->sack_enable)
1198: tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
1199: #endif
1200: #ifdef TCP_ECN
1201: /*
1202: * if ECE is set but CWR is not set for SYN-ACK, or
1203: * both ECE and CWR are set for simultaneous open,
1204: * peer is ECN capable.
1205: */
1206: if (tcp_do_ecn) {
1207: if ((tiflags & (TH_ACK|TH_ECE|TH_CWR))
1208: == (TH_ACK|TH_ECE) ||
1209: (tiflags & (TH_ACK|TH_ECE|TH_CWR))
1210: == (TH_ECE|TH_CWR)) {
1211: tp->t_flags |= TF_ECN_PERMIT;
1212: tiflags &= ~(TH_ECE|TH_CWR);
1213: tcpstat.tcps_ecn_accepts++;
1214: }
1215: }
1216: #endif
1217:
1218: if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
1219: tcpstat.tcps_connects++;
1220: soisconnected(so);
1221: tp->t_state = TCPS_ESTABLISHED;
1222: TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
1223: /* Do window scaling on this connection? */
1224: if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1225: (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1226: tp->snd_scale = tp->requested_s_scale;
1227: tp->rcv_scale = tp->request_r_scale;
1228: }
1229: tcp_reass_lock(tp);
1230: (void) tcp_reass(tp, (struct tcphdr *)0,
1231: (struct mbuf *)0, &tlen);
1232: tcp_reass_unlock(tp);
1233: /*
1234: * if we didn't have to retransmit the SYN,
1235: * use its rtt as our initial srtt & rtt var.
1236: */
1237: if (tp->t_rtttime)
1238: tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
1239: /*
1240: * Since new data was acked (the SYN), open the
1241: * congestion window by one MSS. We do this
1242: * here, because we won't go through the normal
1243: * ACK processing below. And since this is the
1244: * start of the connection, we know we are in
1245: * the exponential phase of slow-start.
1246: */
1247: tp->snd_cwnd += tp->t_maxseg;
1248: } else
1249: tp->t_state = TCPS_SYN_RECEIVED;
1250:
1251: #if 0
1252: trimthenstep6:
1253: #endif
1254: /*
1255: * Advance th->th_seq to correspond to first data byte.
1256: * If data, trim to stay within window,
1257: * dropping FIN if necessary.
1258: */
1259: th->th_seq++;
1260: if (tlen > tp->rcv_wnd) {
1261: todrop = tlen - tp->rcv_wnd;
1262: m_adj(m, -todrop);
1263: tlen = tp->rcv_wnd;
1264: tiflags &= ~TH_FIN;
1265: tcpstat.tcps_rcvpackafterwin++;
1266: tcpstat.tcps_rcvbyteafterwin += todrop;
1267: }
1268: tp->snd_wl1 = th->th_seq - 1;
1269: tp->rcv_up = th->th_seq;
1270: goto step6;
1271: /*
1272: * If a new connection request is received while in TIME_WAIT,
1273: 	 * drop the old connection and start over if the
1274: * timestamp or the sequence numbers are above the previous
1275: * ones.
1276: */
1277: case TCPS_TIME_WAIT:
1278: if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
1279: ((opti.ts_present &&
1280: TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
1281: SEQ_GT(th->th_seq, tp->rcv_nxt))) {
1282: /*
1283: * Advance the iss by at least 32768, but
1284: * clear the msb in order to make sure
1285: * that SEG_LT(snd_nxt, iss).
1286: */
1287: iss = tp->snd_nxt +
1288: ((arc4random() & 0x7fffffff) | 0x8000);
1289: reuse = &iss;
1290: tp = tcp_close(tp);
1291: goto findpcb;
1292: }
1293: }
1294:
1295: /*
1296: * States other than LISTEN or SYN_SENT.
1297: * First check timestamp, if present.
1298: * Then check that at least some bytes of segment are within
1299: * receive window. If segment begins before rcv_nxt,
1300: * drop leading data (and SYN); if nothing left, just ack.
1301: *
1302: * RFC 1323 PAWS: If we have a timestamp reply on this segment
1303: * and it's less than opti.ts_recent, drop it.
1304: */
1305: if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
1306: TSTMP_LT(opti.ts_val, tp->ts_recent)) {
1307:
1308: /* Check to see if ts_recent is over 24 days old. */
1309: if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
1310: /*
1311: * Invalidate ts_recent. If this segment updates
1312: * ts_recent, the age will be reset later and ts_recent
1313: * will get a valid value. If it does not, setting
1314: * ts_recent to zero will at least satisfy the
1315: * requirement that zero be placed in the timestamp
1316: * echo reply when ts_recent isn't valid. The
1317: * age isn't reset until we get a valid ts_recent
1318: * because we don't want out-of-order segments to be
1319: * dropped when ts_recent is old.
1320: */
1321: tp->ts_recent = 0;
1322: } else {
1323: tcpstat.tcps_rcvduppack++;
1324: tcpstat.tcps_rcvdupbyte += tlen;
1325: tcpstat.tcps_pawsdrop++;
1326: goto dropafterack;
1327: }
1328: }
1329:
1330: todrop = tp->rcv_nxt - th->th_seq;
1331: if (todrop > 0) {
1332: if (tiflags & TH_SYN) {
1333: tiflags &= ~TH_SYN;
1334: th->th_seq++;
1335: if (th->th_urp > 1)
1336: th->th_urp--;
1337: else
1338: tiflags &= ~TH_URG;
1339: todrop--;
1340: }
1341: if (todrop > tlen ||
1342: (todrop == tlen && (tiflags & TH_FIN) == 0)) {
1343: /*
1344: * Any valid FIN must be to the left of the
1345: * window. At this point, FIN must be a
1346: * duplicate or out-of-sequence, so drop it.
1347: */
1348: tiflags &= ~TH_FIN;
1349: /*
1350: * Send ACK to resynchronize, and drop any data,
1351: * but keep on processing for RST or ACK.
1352: */
1353: tp->t_flags |= TF_ACKNOW;
1354: tcpstat.tcps_rcvdupbyte += todrop = tlen;
1355: tcpstat.tcps_rcvduppack++;
1356: } else {
1357: tcpstat.tcps_rcvpartduppack++;
1358: tcpstat.tcps_rcvpartdupbyte += todrop;
1359: }
1360: hdroptlen += todrop; /* drop from head afterwards */
1361: th->th_seq += todrop;
1362: tlen -= todrop;
1363: if (th->th_urp > todrop)
1364: th->th_urp -= todrop;
1365: else {
1366: tiflags &= ~TH_URG;
1367: th->th_urp = 0;
1368: }
1369: }
1370:
1371: /*
1372: * If new data are received on a connection after the
1373: * user processes are gone, then RST the other end.
1374: */
1375: if ((so->so_state & SS_NOFDREF) &&
1376: tp->t_state > TCPS_CLOSE_WAIT && tlen) {
1377: tp = tcp_close(tp);
1378: tcpstat.tcps_rcvafterclose++;
1379: goto dropwithreset;
1380: }
1381:
1382: /*
1383: * If segment ends after window, drop trailing data
1384: * (and PUSH and FIN); if nothing left, just ACK.
1385: */
1386: todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
1387: if (todrop > 0) {
1388: tcpstat.tcps_rcvpackafterwin++;
1389: if (todrop >= tlen) {
1390: tcpstat.tcps_rcvbyteafterwin += tlen;
1391: /*
1392: * If window is closed can only take segments at
1393: * window edge, and have to drop data and PUSH from
1394: * incoming segments. Continue processing, but
1395: * remember to ack. Otherwise, drop segment
1396: * and ack.
1397: */
1398: if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
1399: tp->t_flags |= TF_ACKNOW;
1400: tcpstat.tcps_rcvwinprobe++;
1401: } else
1402: goto dropafterack;
1403: } else
1404: tcpstat.tcps_rcvbyteafterwin += todrop;
1405: m_adj(m, -todrop);
1406: tlen -= todrop;
1407: tiflags &= ~(TH_PUSH|TH_FIN);
1408: }
1409:
1410: /*
1411: * If last ACK falls within this segment's sequence numbers,
1412: * record its timestamp if it's more recent.
1413: * Cf fix from Braden, see Stevens p. 870
1414: */
1415: if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
1416: SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
1417: if (SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
1418: ((tiflags & (TH_SYN|TH_FIN)) != 0)))
1419: tp->ts_recent = opti.ts_val;
1420: else
1421: tp->ts_recent = 0;
1422: tp->ts_recent_age = tcp_now;
1423: }
1424:
1425: /*
1426: * If the RST bit is set examine the state:
1427: * SYN_RECEIVED STATE:
1428: * If passive open, return to LISTEN state.
1429: * If active open, inform user that connection was refused.
1430: * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
1431: * Inform user that connection was reset, and close tcb.
1432: * CLOSING, LAST_ACK, TIME_WAIT STATES
1433: * Close the tcb.
1434: */
1435: if (tiflags & TH_RST) {
1436: if (th->th_seq != tp->last_ack_sent &&
1437: th->th_seq != tp->rcv_nxt &&
1438: th->th_seq != (tp->rcv_nxt + 1))
1439: goto drop;
1440:
1441: switch (tp->t_state) {
1442: case TCPS_SYN_RECEIVED:
1443: #ifdef TCP_ECN
1444: /* if ECN is enabled, fall back to non-ecn at rexmit */
1445: if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
1446: goto drop;
1447: #endif
1448: so->so_error = ECONNREFUSED;
1449: goto close;
1450:
1451: case TCPS_ESTABLISHED:
1452: case TCPS_FIN_WAIT_1:
1453: case TCPS_FIN_WAIT_2:
1454: case TCPS_CLOSE_WAIT:
1455: so->so_error = ECONNRESET;
1456: close:
1457: tp->t_state = TCPS_CLOSED;
1458: tcpstat.tcps_drops++;
1459: tp = tcp_close(tp);
1460: goto drop;
1461: case TCPS_CLOSING:
1462: case TCPS_LAST_ACK:
1463: case TCPS_TIME_WAIT:
1464: tp = tcp_close(tp);
1465: goto drop;
1466: }
1467: }
1468:
1469: /*
1470: * If a SYN is in the window, then this is an
1471: * error and we ACK and drop the packet.
1472: */
1473: if (tiflags & TH_SYN)
1474: goto dropafterack_ratelim;
1475:
1476: /*
1477: * If the ACK bit is off we drop the segment and return.
1478: */
1479: if ((tiflags & TH_ACK) == 0) {
1480: if (tp->t_flags & TF_ACKNOW)
1481: goto dropafterack;
1482: else
1483: goto drop;
1484: }
1485:
1486: /*
1487: * Ack processing.
1488: */
1489: switch (tp->t_state) {
1490:
1491: /*
1492: * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
1493: * ESTABLISHED state and continue processing.
1494: * The ACK was checked above.
1495: */
1496: case TCPS_SYN_RECEIVED:
1497: tcpstat.tcps_connects++;
1498: soisconnected(so);
1499: tp->t_state = TCPS_ESTABLISHED;
1500: TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
1501: /* Do window scaling? */
1502: if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1503: (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1504: tp->snd_scale = tp->requested_s_scale;
1505: tp->rcv_scale = tp->request_r_scale;
1506: }
1507: tcp_reass_lock(tp);
1508: (void) tcp_reass(tp, (struct tcphdr *)0, (struct mbuf *)0,
1509: &tlen);
1510: tcp_reass_unlock(tp);
1511: tp->snd_wl1 = th->th_seq - 1;
1512: /* fall into ... */
1513:
1514: /*
1515: * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1516: * ACKs. If the ack is in the range
1517: * tp->snd_una < th->th_ack <= tp->snd_max
1518: * then advance tp->snd_una to th->th_ack and drop
1519: * data from the retransmission queue. If this ACK reflects
1520: * more up to date window information we update our window information.
1521: */
1522: case TCPS_ESTABLISHED:
1523: case TCPS_FIN_WAIT_1:
1524: case TCPS_FIN_WAIT_2:
1525: case TCPS_CLOSE_WAIT:
1526: case TCPS_CLOSING:
1527: case TCPS_LAST_ACK:
1528: case TCPS_TIME_WAIT:
1529: #ifdef TCP_ECN
1530: /*
1531: * if we receive ECE and are not already in recovery phase,
1532: * reduce cwnd by half but don't slow-start.
1533: * advance snd_last to snd_max not to reduce cwnd again
1534: * until all outstanding packets are acked.
1535: */
1536: if (tcp_do_ecn && (tiflags & TH_ECE)) {
1537: if ((tp->t_flags & TF_ECN_PERMIT) &&
1538: SEQ_GEQ(tp->snd_una, tp->snd_last)) {
1539: u_int win;
1540:
1541: win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
1542: if (win > 1) {
1543: tp->snd_ssthresh = win / 2 * tp->t_maxseg;
1544: tp->snd_cwnd = tp->snd_ssthresh;
1545: tp->snd_last = tp->snd_max;
1546: tp->t_flags |= TF_SEND_CWR;
1547: tcpstat.tcps_cwr_ecn++;
1548: }
1549: }
1550: tcpstat.tcps_ecn_rcvece++;
1551: }
1552: /*
1553: * if we receive CWR, we know that the peer has reduced
1554: * its congestion window. stop sending ecn-echo.
1555: */
1556: if ((tiflags & TH_CWR)) {
1557: tp->t_flags &= ~TF_RCVD_CE;
1558: tcpstat.tcps_ecn_rcvcwr++;
1559: }
1560: #endif /* TCP_ECN */
1561:
1562: if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
1563: /*
1564: * Duplicate/old ACK processing.
1565: * Increments t_dupacks:
1566: * Pure duplicate (same seq/ack/window, no data)
1567: * Doesn't affect t_dupacks:
1568: * Data packets.
1569: * Normal window updates (window opens)
1570: * Resets t_dupacks:
1571: * New data ACKed.
1572: * Window shrinks
1573: * Old ACK
1574: */
1575: if (tlen) {
1576: /* Drop very old ACKs unless th_seq matches */
1577: if (th->th_seq != tp->rcv_nxt &&
1578: SEQ_LT(th->th_ack,
1579: tp->snd_una - tp->max_sndwnd)) {
1580: tcpstat.tcps_rcvacktooold++;
1581: goto drop;
1582: }
1583: break;
1584: }
1585: /*
1586: * If we get an old ACK, there is probably packet
1587: * reordering going on. Be conservative and reset
1588: 			 * t_dupacks so that we are less aggressive in
1589: * doing a fast retransmit.
1590: */
1591: if (th->th_ack != tp->snd_una) {
1592: tp->t_dupacks = 0;
1593: break;
1594: }
1595: if (tiwin == tp->snd_wnd) {
1596: tcpstat.tcps_rcvdupack++;
1597: /*
1598: * If we have outstanding data (other than
1599: * a window probe), this is a completely
1600: * duplicate ack (ie, window info didn't
1601: * change), the ack is the biggest we've
1602: * seen and we've seen exactly our rexmt
1603: * threshold of them, assume a packet
1604: * has been dropped and retransmit it.
1605: * Kludge snd_nxt & the congestion
1606: * window so we send only this one
1607: * packet.
1608: *
1609: * We know we're losing at the current
1610: * window size so do congestion avoidance
1611: * (set ssthresh to half the current window
1612: * and pull our congestion window back to
1613: * the new ssthresh).
1614: *
1615: * Dup acks mean that packets have left the
1616: * network (they're now cached at the receiver)
1617: * so bump cwnd by the amount in the receiver
1618: * to keep a constant cwnd packets in the
1619: * network.
1620: */
1621: if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
1622: tp->t_dupacks = 0;
1623: #if defined(TCP_SACK) && defined(TCP_FACK)
1624: /*
1625: * In FACK, can enter fast rec. if the receiver
1626: * reports a reass. queue longer than 3 segs.
1627: */
1628: else if (++tp->t_dupacks == tcprexmtthresh ||
1629: ((SEQ_GT(tp->snd_fack, tcprexmtthresh *
1630: tp->t_maxseg + tp->snd_una)) &&
1631: SEQ_GT(tp->snd_una, tp->snd_last))) {
1632: #else
1633: else if (++tp->t_dupacks == tcprexmtthresh) {
1634: #endif /* TCP_FACK */
1635: tcp_seq onxt = tp->snd_nxt;
1636: u_long win =
1637: ulmin(tp->snd_wnd, tp->snd_cwnd) /
1638: 2 / tp->t_maxseg;
1639:
1640: #if defined(TCP_SACK) || defined(TCP_ECN)
1641: if (SEQ_LT(th->th_ack, tp->snd_last)){
1642: /*
1643: * False fast retx after
1644: * timeout. Do not cut window.
1645: */
1646: tp->t_dupacks = 0;
1647: goto drop;
1648: }
1649: #endif
1650: if (win < 2)
1651: win = 2;
1652: tp->snd_ssthresh = win * tp->t_maxseg;
1653: #if defined(TCP_SACK)
1654: tp->snd_last = tp->snd_max;
1655: #endif
1656: #ifdef TCP_SACK
1657: if (tp->sack_enable) {
1658: TCP_TIMER_DISARM(tp, TCPT_REXMT);
1659: tp->t_rtttime = 0;
1660: #ifdef TCP_ECN
1661: tp->t_flags |= TF_SEND_CWR;
1662: #endif
1663: #if 1 /* TCP_ECN */
1664: tcpstat.tcps_cwr_frecovery++;
1665: #endif
1666: tcpstat.tcps_sack_recovery_episode++;
1667: #if defined(TCP_SACK) && defined(TCP_FACK)
1668: tp->t_dupacks = tcprexmtthresh;
1669: (void) tcp_output(tp);
1670: /*
1671: * During FR, snd_cwnd is held
1672: * constant for FACK.
1673: */
1674: tp->snd_cwnd = tp->snd_ssthresh;
1675: #else
1676: /*
1677: * tcp_output() will send
1678: * oldest SACK-eligible rtx.
1679: */
1680: (void) tcp_output(tp);
1681: tp->snd_cwnd = tp->snd_ssthresh+
1682: tp->t_maxseg * tp->t_dupacks;
1683: #endif /* TCP_FACK */
1684: goto drop;
1685: }
1686: #endif /* TCP_SACK */
1687: TCP_TIMER_DISARM(tp, TCPT_REXMT);
1688: tp->t_rtttime = 0;
1689: tp->snd_nxt = th->th_ack;
1690: tp->snd_cwnd = tp->t_maxseg;
1691: #ifdef TCP_ECN
1692: tp->t_flags |= TF_SEND_CWR;
1693: #endif
1694: #if 1 /* TCP_ECN */
1695: tcpstat.tcps_cwr_frecovery++;
1696: #endif
1697: tcpstat.tcps_sndrexmitfast++;
1698: (void) tcp_output(tp);
1699:
1700: tp->snd_cwnd = tp->snd_ssthresh +
1701: tp->t_maxseg * tp->t_dupacks;
1702: if (SEQ_GT(onxt, tp->snd_nxt))
1703: tp->snd_nxt = onxt;
1704: goto drop;
1705: } else if (tp->t_dupacks > tcprexmtthresh) {
1706: #if defined(TCP_SACK) && defined(TCP_FACK)
1707: /*
1708: * while (awnd < cwnd)
1709: * sendsomething();
1710: */
1711: if (tp->sack_enable) {
1712: if (tp->snd_awnd < tp->snd_cwnd)
1713: tcp_output(tp);
1714: goto drop;
1715: }
1716: #endif /* TCP_FACK */
1717: tp->snd_cwnd += tp->t_maxseg;
1718: (void) tcp_output(tp);
1719: goto drop;
1720: }
1721: } else if (tiwin < tp->snd_wnd) {
1722: /*
1723: * The window was retracted! Previous dup
1724: * ACKs may have been due to packets arriving
1725: * after the shrunken window, not a missing
1726: * packet, so play it safe and reset t_dupacks
1727: */
1728: tp->t_dupacks = 0;
1729: }
1730: break;
1731: }
1732: /*
1733: * If the congestion window was inflated to account
1734: * for the other side's cached packets, retract it.
1735: */
1736: #if defined(TCP_SACK)
1737: if (tp->sack_enable) {
1738: if (tp->t_dupacks >= tcprexmtthresh) {
1739: /* Check for a partial ACK */
1740: if (tcp_sack_partialack(tp, th)) {
1741: #if defined(TCP_SACK) && defined(TCP_FACK)
1742: /* Force call to tcp_output */
1743: if (tp->snd_awnd < tp->snd_cwnd)
1744: needoutput = 1;
1745: #else
1746: tp->snd_cwnd += tp->t_maxseg;
1747: needoutput = 1;
1748: #endif /* TCP_FACK */
1749: } else {
1750: /* Out of fast recovery */
1751: tp->snd_cwnd = tp->snd_ssthresh;
1752: if (tcp_seq_subtract(tp->snd_max,
1753: th->th_ack) < tp->snd_ssthresh)
1754: tp->snd_cwnd =
1755: tcp_seq_subtract(tp->snd_max,
1756: th->th_ack);
1757: tp->t_dupacks = 0;
1758: #if defined(TCP_SACK) && defined(TCP_FACK)
1759: if (SEQ_GT(th->th_ack, tp->snd_fack))
1760: tp->snd_fack = th->th_ack;
1761: #endif /* TCP_FACK */
1762: }
1763: }
1764: } else {
1765: if (tp->t_dupacks >= tcprexmtthresh &&
1766: !tcp_newreno(tp, th)) {
1767: /* Out of fast recovery */
1768: tp->snd_cwnd = tp->snd_ssthresh;
1769: if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
1770: tp->snd_ssthresh)
1771: tp->snd_cwnd =
1772: tcp_seq_subtract(tp->snd_max,
1773: th->th_ack);
1774: tp->t_dupacks = 0;
1775: }
1776: }
1777: if (tp->t_dupacks < tcprexmtthresh)
1778: tp->t_dupacks = 0;
1779: #else /* else no TCP_SACK */
1780: if (tp->t_dupacks >= tcprexmtthresh &&
1781: tp->snd_cwnd > tp->snd_ssthresh)
1782: tp->snd_cwnd = tp->snd_ssthresh;
1783: tp->t_dupacks = 0;
1784: #endif
1785: if (SEQ_GT(th->th_ack, tp->snd_max)) {
1786: tcpstat.tcps_rcvacktoomuch++;
1787: goto dropafterack_ratelim;
1788: }
1789: acked = th->th_ack - tp->snd_una;
1790: tcpstat.tcps_rcvackpack++;
1791: tcpstat.tcps_rcvackbyte += acked;
1792:
1793: /*
1794: * If we have a timestamp reply, update smoothed
1795: * round trip time. If no timestamp is present but
1796: * transmit timer is running and timed sequence
1797: * number was acked, update smoothed round trip time.
1798: * Since we now have an rtt measurement, cancel the
1799: * timer backoff (cf., Phil Karn's retransmit alg.).
1800: * Recompute the initial retransmit timer.
1801: */
1802: if (opti.ts_present && opti.ts_ecr)
1803: tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
1804: else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
1805: tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
1806:
1807: /*
1808: * If all outstanding data is acked, stop retransmit
1809: * timer and remember to restart (more output or persist).
1810: * If there is more data to be acked, restart retransmit
1811: * timer, using current (possibly backed-off) value.
1812: */
1813: if (th->th_ack == tp->snd_max) {
1814: TCP_TIMER_DISARM(tp, TCPT_REXMT);
1815: needoutput = 1;
1816: } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
1817: TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
1818: /*
1819: * When new data is acked, open the congestion window.
1820: * If the window gives us less than ssthresh packets
1821: * in flight, open exponentially (maxseg per packet).
1822: * Otherwise open linearly: maxseg per window
1823: * (maxseg^2 / cwnd per packet).
1824: */
1825: {
1826: u_int cw = tp->snd_cwnd;
1827: u_int incr = tp->t_maxseg;
1828:
1829: if (cw > tp->snd_ssthresh)
1830: incr = incr * incr / cw;
1831: #if defined (TCP_SACK)
1832: if (tp->t_dupacks < tcprexmtthresh)
1833: #endif
1834: tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN<<tp->snd_scale);
1835: }
1836: ND6_HINT(tp);
1837: if (acked > so->so_snd.sb_cc) {
1838: tp->snd_wnd -= so->so_snd.sb_cc;
1839: sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
1840: ourfinisacked = 1;
1841: } else {
1842: sbdrop(&so->so_snd, acked);
1843: tp->snd_wnd -= acked;
1844: ourfinisacked = 0;
1845: }
1846: if (sb_notify(&so->so_snd))
1847: sowwakeup(so);
1848:
1849: /*
1850: * If we had a pending ICMP message that referred to data
1851: * that have just been acknowledged, disregard the recorded
1852: * ICMP message.
1853: */
1854: if ((tp->t_flags & TF_PMTUD_PEND) &&
1855: SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
1856: tp->t_flags &= ~TF_PMTUD_PEND;
1857:
1858: /*
1859: * Keep track of the largest chunk of data acknowledged
1860: * since last PMTU update
1861: */
1862: if (tp->t_pmtud_mss_acked < acked)
1863: tp->t_pmtud_mss_acked = acked;
1864:
1865: tp->snd_una = th->th_ack;
1866: #ifdef TCP_ECN
1867: /* sync snd_last with snd_una */
1868: if (SEQ_GT(tp->snd_una, tp->snd_last))
1869: tp->snd_last = tp->snd_una;
1870: #endif
1871: if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1872: tp->snd_nxt = tp->snd_una;
1873: #if defined (TCP_SACK) && defined (TCP_FACK)
1874: if (SEQ_GT(tp->snd_una, tp->snd_fack)) {
1875: tp->snd_fack = tp->snd_una;
1876: /* Update snd_awnd for partial ACK
1877: * without any SACK blocks.
1878: */
1879: tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt,
1880: tp->snd_fack) + tp->retran_data;
1881: }
1882: #endif
1883:
1884: switch (tp->t_state) {
1885:
1886: /*
1887: * In FIN_WAIT_1 STATE in addition to the processing
1888: * for the ESTABLISHED state if our FIN is now acknowledged
1889: * then enter FIN_WAIT_2.
1890: */
1891: case TCPS_FIN_WAIT_1:
1892: if (ourfinisacked) {
1893: /*
1894: * If we can't receive any more
1895: * data, then closing user can proceed.
1896: * Starting the timer is contrary to the
1897: * specification, but if we don't get a FIN
1898: * we'll hang forever.
1899: */
1900: if (so->so_state & SS_CANTRCVMORE) {
1901: soisdisconnected(so);
1902: TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
1903: }
1904: tp->t_state = TCPS_FIN_WAIT_2;
1905: }
1906: break;
1907:
1908: /*
1909: * In CLOSING STATE in addition to the processing for
1910: * the ESTABLISHED state if the ACK acknowledges our FIN
1911: * then enter the TIME-WAIT state, otherwise ignore
1912: * the segment.
1913: */
1914: case TCPS_CLOSING:
1915: if (ourfinisacked) {
1916: tp->t_state = TCPS_TIME_WAIT;
1917: tcp_canceltimers(tp);
1918: TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
1919: soisdisconnected(so);
1920: }
1921: break;
1922:
1923: /*
1924: * In LAST_ACK, we may still be waiting for data to drain
1925: * and/or to be acked, as well as for the ack of our FIN.
1926: * If our FIN is now acknowledged, delete the TCB,
1927: * enter the closed state and return.
1928: */
1929: case TCPS_LAST_ACK:
1930: if (ourfinisacked) {
1931: tp = tcp_close(tp);
1932: goto drop;
1933: }
1934: break;
1935:
1936: /*
1937: * In TIME_WAIT state the only thing that should arrive
1938: * is a retransmission of the remote FIN. Acknowledge
1939: * it and restart the finack timer.
1940: */
1941: case TCPS_TIME_WAIT:
1942: TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
1943: goto dropafterack;
1944: }
1945: }
1946:
1947: step6:
1948: /*
1949: * Update window information.
1950: * Don't look at window if no ACK: TAC's send garbage on first SYN.
1951: */
1952: if ((tiflags & TH_ACK) &&
1953: (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
1954: (SEQ_LT(tp->snd_wl2, th->th_ack) ||
1955: (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
1956: /* keep track of pure window updates */
1957: if (tlen == 0 &&
1958: tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
1959: tcpstat.tcps_rcvwinupd++;
1960: tp->snd_wnd = tiwin;
1961: tp->snd_wl1 = th->th_seq;
1962: tp->snd_wl2 = th->th_ack;
1963: if (tp->snd_wnd > tp->max_sndwnd)
1964: tp->max_sndwnd = tp->snd_wnd;
1965: needoutput = 1;
1966: }
1967:
1968: /*
1969: * Process segments with URG.
1970: */
1971: if ((tiflags & TH_URG) && th->th_urp &&
1972: TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1973: /*
1974: * This is a kludge, but if we receive and accept
1975: * random urgent pointers, we'll crash in
1976: * soreceive. It's hard to imagine someone
1977: * actually wanting to send this much urgent data.
1978: */
1979: if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
1980: th->th_urp = 0; /* XXX */
1981: tiflags &= ~TH_URG; /* XXX */
1982: goto dodata; /* XXX */
1983: }
1984: /*
1985: * If this segment advances the known urgent pointer,
1986: * then mark the data stream. This should not happen
1987: * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
1988: * a FIN has been received from the remote side.
1989: * In these states we ignore the URG.
1990: *
1991: * According to RFC961 (Assigned Protocols),
1992: * the urgent pointer points to the last octet
1993: * of urgent data. We continue, however,
1994: * to consider it to indicate the first octet
1995: * of data past the urgent section as the original
1996: * spec states (in one of two places).
1997: */
1998: if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
1999: tp->rcv_up = th->th_seq + th->th_urp;
2000: so->so_oobmark = so->so_rcv.sb_cc +
2001: (tp->rcv_up - tp->rcv_nxt) - 1;
2002: if (so->so_oobmark == 0)
2003: so->so_state |= SS_RCVATMARK;
2004: sohasoutofband(so);
2005: tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
2006: }
2007: /*
2008: * Remove out of band data so doesn't get presented to user.
2009: * This can happen independent of advancing the URG pointer,
2010: * but if two URG's are pending at once, some out-of-band
2011: * data may creep in... ick.
2012: */
2013: if (th->th_urp <= (u_int16_t) tlen
2014: #ifdef SO_OOBINLINE
2015: && (so->so_options & SO_OOBINLINE) == 0
2016: #endif
2017: )
2018: tcp_pulloutofband(so, th->th_urp, m, hdroptlen);
2019: } else
2020: /*
2021: * If no out of band data is expected,
2022: * pull receive urgent pointer along
2023: * with the receive window.
2024: */
2025: if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
2026: tp->rcv_up = tp->rcv_nxt;
2027: dodata: /* XXX */
2028:
2029: /*
2030: * Process the segment text, merging it into the TCP sequencing queue,
2031: * and arranging for acknowledgment of receipt if necessary.
2032: * This process logically involves adjusting tp->rcv_wnd as data
2033: * is presented to the user (this happens in tcp_usrreq.c,
2034: * case PRU_RCVD). If a FIN has already been received on this
2035: * connection then we just ignore the text.
2036: */
2037: if ((tlen || (tiflags & TH_FIN)) &&
2038: TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2039: #ifdef TCP_SACK
2040: tcp_seq laststart = th->th_seq;
2041: tcp_seq lastend = th->th_seq + tlen;
2042: #endif
2043: tcp_reass_lock(tp);
2044: if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) &&
2045: tp->t_state == TCPS_ESTABLISHED) {
2046: tcp_reass_unlock(tp);
2047: TCP_SETUP_ACK(tp, tiflags);
2048: tp->rcv_nxt += tlen;
2049: tiflags = th->th_flags & TH_FIN;
2050: tcpstat.tcps_rcvpack++;
2051: tcpstat.tcps_rcvbyte += tlen;
2052: ND6_HINT(tp);
2053: if (so->so_state & SS_CANTRCVMORE)
2054: m_freem(m);
2055: else {
2056: m_adj(m, hdroptlen);
2057: sbappendstream(&so->so_rcv, m);
2058: }
2059: sorwakeup(so);
2060: } else {
2061: m_adj(m, hdroptlen);
2062: tiflags = tcp_reass(tp, th, m, &tlen);
2063: tcp_reass_unlock(tp);
2064: tp->t_flags |= TF_ACKNOW;
2065: }
2066: #ifdef TCP_SACK
2067: if (tp->sack_enable)
2068: tcp_update_sack_list(tp, laststart, lastend);
2069: #endif
2070:
2071: /*
2072: * variable len never referenced again in modern BSD,
2073: * so why bother computing it ??
2074: */
2075: #if 0
2076: /*
2077: * Note the amount of data that peer has sent into
2078: * our window, in order to estimate the sender's
2079: * buffer size.
2080: */
2081: len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
2082: #endif /* 0 */
2083: } else {
2084: m_freem(m);
2085: tiflags &= ~TH_FIN;
2086: }
2087:
2088: /*
2089: * If FIN is received ACK the FIN and let the user know
2090: * that the connection is closing. Ignore a FIN received before
2091: * the connection is fully established.
2092: */
2093: if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) {
2094: if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2095: socantrcvmore(so);
2096: tp->t_flags |= TF_ACKNOW;
2097: tp->rcv_nxt++;
2098: }
2099: switch (tp->t_state) {
2100:
2101: /*
2102: * In ESTABLISHED STATE enter the CLOSE_WAIT state.
2103: */
2104: case TCPS_ESTABLISHED:
2105: tp->t_state = TCPS_CLOSE_WAIT;
2106: break;
2107:
2108: /*
2109: * If still in FIN_WAIT_1 STATE FIN has not been acked so
2110: * enter the CLOSING state.
2111: */
2112: case TCPS_FIN_WAIT_1:
2113: tp->t_state = TCPS_CLOSING;
2114: break;
2115:
2116: /*
2117: * In FIN_WAIT_2 state enter the TIME_WAIT state,
2118: * starting the time-wait timer, turning off the other
2119: * standard timers.
2120: */
2121: case TCPS_FIN_WAIT_2:
2122: tp->t_state = TCPS_TIME_WAIT;
2123: tcp_canceltimers(tp);
2124: TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
2125: soisdisconnected(so);
2126: break;
2127:
2128: /*
2129: * In TIME_WAIT state restart the 2 MSL time_wait timer.
2130: */
2131: case TCPS_TIME_WAIT:
2132: TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
2133: break;
2134: }
2135: }
2136: if (so->so_options & SO_DEBUG) {
2137: switch (tp->pf) {
2138: #ifdef INET6
2139: case PF_INET6:
2140: tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6,
2141: 0, tlen);
2142: break;
2143: #endif /* INET6 */
2144: case PF_INET:
2145: tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti,
2146: 0, tlen);
2147: break;
2148: }
2149: }
2150:
2151: /*
2152: * Return any desired output.
2153: */
2154: if (needoutput || (tp->t_flags & TF_ACKNOW)) {
2155: (void) tcp_output(tp);
2156: }
2157: return;
2158:
2159: badsyn:
2160: /*
2161: * Received a bad SYN. Increment counters and dropwithreset.
2162: */
2163: tcpstat.tcps_badsyn++;
2164: tp = NULL;
2165: goto dropwithreset;
2166:
2167: dropafterack_ratelim:
2168: if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count,
2169: tcp_ackdrop_ppslim) == 0) {
2170: /* XXX stat */
2171: goto drop;
2172: }
2173: /* ...fall into dropafterack... */
2174:
2175: dropafterack:
2176: /*
2177: * Generate an ACK dropping incoming segment if it occupies
2178: * sequence space, where the ACK reflects our state.
2179: */
2180: if (tiflags & TH_RST)
2181: goto drop;
2182: m_freem(m);
2183: tp->t_flags |= TF_ACKNOW;
2184: (void) tcp_output(tp);
2185: return;
2186:
2187: dropwithreset_ratelim:
2188: /*
2189: * We may want to rate-limit RSTs in certain situations,
2190: * particularly if we are sending an RST in response to
2191: * an attempt to connect to or otherwise communicate with
2192: * a port for which we have no socket.
2193: */
2194: if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count,
2195: tcp_rst_ppslim) == 0) {
2196: /* XXX stat */
2197: goto drop;
2198: }
2199: /* ...fall into dropwithreset... */
2200:
2201: dropwithreset:
2202: /*
2203: * Generate a RST, dropping incoming segment.
2204: * Make ACK acceptable to originator of segment.
2205: * Don't bother to respond to RST.
2206: */
2207: if (tiflags & TH_RST)
2208: goto drop;
2209: if (tiflags & TH_ACK) {
2210: tcp_respond(tp, mtod(m, caddr_t), m, (tcp_seq)0, th->th_ack,
2211: TH_RST);
2212: } else {
2213: if (tiflags & TH_SYN)
2214: tlen++;
2215: tcp_respond(tp, mtod(m, caddr_t), m, th->th_seq + tlen,
2216: (tcp_seq)0, TH_RST|TH_ACK);
2217: }
2218: return;
2219:
2220: drop:
2221: /*
2222: * Drop space held by incoming segment and return.
2223: */
2224: if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) {
2225: switch (tp->pf) {
2226: #ifdef INET6
2227: case PF_INET6:
2228: tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6,
2229: 0, tlen);
2230: break;
2231: #endif /* INET6 */
2232: case PF_INET:
2233: tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti,
2234: 0, tlen);
2235: break;
2236: }
2237: }
2238:
2239: m_freem(m);
2240: return;
2241: }
2242:
/*
 * Parse the TCP options pointed to by cp (cnt bytes' worth) for the
 * segment whose header is th, recording values of interest in *oi and
 * in the connection block tp.
 *
 * MSS, window-scale, SACK-permitted, and the connection-enabling half
 * of the timestamp option are honored only on a SYN segment and only
 * before a SYN has been received on this connection; SACK blocks are
 * handed off to tcp_sack_option().  Under TCP_SIGNATURE, the segment's
 * signature option is located here and verified against the TDB (SA)
 * for this peer; m/iphlen give tcp_signature() access to the packet.
 *
 * Returns 0 on success, -1 when a signature is duplicated with a
 * different value, missing while required, present while not expected,
 * or fails verification.
 */
int
tcp_dooptions(tp, cp, cnt, th, m, iphlen, oi)
	struct tcpcb *tp;
	u_char *cp;
	int cnt;
	struct tcphdr *th;
	struct mbuf *m;
	int iphlen;
	struct tcp_opt_info *oi;
{
	u_int16_t mss = 0;
	int opt, optlen;
#ifdef TCP_SIGNATURE
	caddr_t sigp = NULL;	/* points at the 16-byte signature, if seen */
	struct tdb *tdb = NULL;	/* SA used to verify the signature */
#endif /* TCP_SIGNATURE */

	for (; cp && cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[0];
		if (opt == TCPOPT_EOL)
			break;
		if (opt == TCPOPT_NOP)
			optlen = 1;
		else {
			if (cnt < 2)
				break;
			optlen = cp[1];
			/* stop parsing on a malformed/truncated length */
			if (optlen < 2 || optlen > cnt)
				break;
		}
		switch (opt) {

		default:
			continue;

		case TCPOPT_MAXSEG:
			if (optlen != TCPOLEN_MAXSEG)
				continue;
			/* MSS is only legal on a SYN, and only once */
			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
			NTOHS(mss);
			oi->maxseg = mss;
			break;

		case TCPOPT_WINDOW:
			if (optlen != TCPOLEN_WINDOW)
				continue;
			/* window scale is only legal on a SYN, once */
			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			tp->t_flags |= TF_RCVD_SCALE;
			/* clamp the peer's requested shift count */
			tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
			break;

		case TCPOPT_TIMESTAMP:
			if (optlen != TCPOLEN_TIMESTAMP)
				continue;
			/* record ts_val/ts_ecr on every segment */
			oi->ts_present = 1;
			bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val));
			NTOHL(oi->ts_val);
			bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr));
			NTOHL(oi->ts_ecr);

			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			/*
			 * A timestamp received in a SYN makes
			 * it ok to send timestamp requests and replies.
			 */
			tp->t_flags |= TF_RCVD_TSTMP;
			tp->ts_recent = oi->ts_val;
			tp->ts_recent_age = tcp_now;
			break;

#ifdef TCP_SACK
		case TCPOPT_SACK_PERMITTED:
			if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED)
				continue;
			if (!(th->th_flags & TH_SYN))
				continue;
			if (TCPS_HAVERCVDSYN(tp->t_state))
				continue;
			/* MUST only be set on SYN */
			tp->t_flags |= TF_SACK_PERMIT;
			break;
		case TCPOPT_SACK:
			tcp_sack_option(tp, th, cp, optlen);
			break;
#endif
#ifdef TCP_SIGNATURE
		case TCPOPT_SIGNATURE:
			if (optlen != TCPOLEN_SIGNATURE)
				continue;

			/* two signature options with differing values */
			if (sigp && bcmp(sigp, cp + 2, 16))
				return (-1);

			/* remember where the signature lives; verify below */
			sigp = cp + 2;
			break;
#endif /* TCP_SIGNATURE */
		}
	}

#ifdef TCP_SIGNATURE
	if (tp->t_flags & TF_SIGNATURE) {
		union sockaddr_union src, dst;

		memset(&src, 0, sizeof(union sockaddr_union));
		memset(&dst, 0, sizeof(union sockaddr_union));

		/* build src/dst addresses from the IP header to find the SA;
		 * a still-unset pf (0) falls through to the IPv4 case */
		switch (tp->pf) {
		case 0:
#ifdef INET
		case AF_INET:
			src.sa.sa_len = sizeof(struct sockaddr_in);
			src.sa.sa_family = AF_INET;
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sa.sa_len = sizeof(struct sockaddr_in);
			dst.sa.sa_family = AF_INET;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#endif
#ifdef INET6
		case AF_INET6:
			src.sa.sa_len = sizeof(struct sockaddr_in6);
			src.sa.sa_family = AF_INET6;
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sa.sa_len = sizeof(struct sockaddr_in6);
			dst.sa.sa_family = AF_INET6;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(0, &src, &dst, IPPROTO_TCP);

		/*
		 * We don't have an SA for this peer, so we turn off
		 * TF_SIGNATURE on the listen socket
		 */
		if (tdb == NULL && tp->t_state == TCPS_LISTEN)
			tp->t_flags &= ~TF_SIGNATURE;

	}

	/* a signature must be present iff this connection requires one */
	if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) {
		tcpstat.tcps_rcvbadsig++;
		return (-1);
	}

	if (sigp) {
		char sig[16];

		/* signature present but no SA to check it against */
		if (tdb == NULL) {
			tcpstat.tcps_rcvbadsig++;
			return (-1);
		}

		if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0)
			return (-1);

		if (bcmp(sig, sigp, 16)) {
			tcpstat.tcps_rcvbadsig++;
			return (-1);
		}

		tcpstat.tcps_rcvgoodsig++;
	}
#endif /* TCP_SIGNATURE */

	return (0);
}
2421:
#if defined(TCP_SACK)
/*
 * Difference of two TCP sequence numbers (a - b).  The unsigned
 * difference is pushed through a cast to (signed) long before being
 * returned, exactly as the original wraparound-aware helper did.
 */
u_long
tcp_seq_subtract(u_long a, u_long b)
{
	u_long diff = a - b;

	return ((long)diff);
}
#endif
2430:
2431:
2432: #ifdef TCP_SACK
2433: /*
2434: * This function is called upon receipt of new valid data (while not in header
2435: * prediction mode), and it updates the ordered list of sacks.
2436: */
2437: void
2438: tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart,
2439: tcp_seq rcv_lastend)
2440: {
2441: /*
2442: * First reported block MUST be the most recent one. Subsequent
2443: * blocks SHOULD be in the order in which they arrived at the
2444: * receiver. These two conditions make the implementation fully
2445: * compliant with RFC 2018.
2446: */
2447: int i, j = 0, count = 0, lastpos = -1;
2448: struct sackblk sack, firstsack, temp[MAX_SACK_BLKS];
2449:
2450: /* First clean up current list of sacks */
2451: for (i = 0; i < tp->rcv_numsacks; i++) {
2452: sack = tp->sackblks[i];
2453: if (sack.start == 0 && sack.end == 0) {
2454: count++; /* count = number of blocks to be discarded */
2455: continue;
2456: }
2457: if (SEQ_LEQ(sack.end, tp->rcv_nxt)) {
2458: tp->sackblks[i].start = tp->sackblks[i].end = 0;
2459: count++;
2460: } else {
2461: temp[j].start = tp->sackblks[i].start;
2462: temp[j++].end = tp->sackblks[i].end;
2463: }
2464: }
2465: tp->rcv_numsacks -= count;
2466: if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */
2467: tcp_clean_sackreport(tp);
2468: if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) {
2469: /* ==> need first sack block */
2470: tp->sackblks[0].start = rcv_laststart;
2471: tp->sackblks[0].end = rcv_lastend;
2472: tp->rcv_numsacks = 1;
2473: }
2474: return;
2475: }
2476: /* Otherwise, sack blocks are already present. */
2477: for (i = 0; i < tp->rcv_numsacks; i++)
2478: tp->sackblks[i] = temp[i]; /* first copy back sack list */
2479: if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend))
2480: return; /* sack list remains unchanged */
2481: /*
2482: * From here, segment just received should be (part of) the 1st sack.
2483: * Go through list, possibly coalescing sack block entries.
2484: */
2485: firstsack.start = rcv_laststart;
2486: firstsack.end = rcv_lastend;
2487: for (i = 0; i < tp->rcv_numsacks; i++) {
2488: sack = tp->sackblks[i];
2489: if (SEQ_LT(sack.end, firstsack.start) ||
2490: SEQ_GT(sack.start, firstsack.end))
2491: continue; /* no overlap */
2492: if (sack.start == firstsack.start && sack.end == firstsack.end){
2493: /*
2494: * identical block; delete it here since we will
2495: * move it to the front of the list.
2496: */
2497: tp->sackblks[i].start = tp->sackblks[i].end = 0;
2498: lastpos = i; /* last posn with a zero entry */
2499: continue;
2500: }
2501: if (SEQ_LEQ(sack.start, firstsack.start))
2502: firstsack.start = sack.start; /* merge blocks */
2503: if (SEQ_GEQ(sack.end, firstsack.end))
2504: firstsack.end = sack.end; /* merge blocks */
2505: tp->sackblks[i].start = tp->sackblks[i].end = 0;
2506: lastpos = i; /* last posn with a zero entry */
2507: }
2508: if (lastpos != -1) { /* at least one merge */
2509: for (i = 0, j = 1; i < tp->rcv_numsacks; i++) {
2510: sack = tp->sackblks[i];
2511: if (sack.start == 0 && sack.end == 0)
2512: continue;
2513: temp[j++] = sack;
2514: }
2515: tp->rcv_numsacks = j; /* including first blk (added later) */
2516: for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */
2517: tp->sackblks[i] = temp[i];
2518: } else { /* no merges -- shift sacks by 1 */
2519: if (tp->rcv_numsacks < MAX_SACK_BLKS)
2520: tp->rcv_numsacks++;
2521: for (i = tp->rcv_numsacks-1; i > 0; i--)
2522: tp->sackblks[i] = tp->sackblks[i-1];
2523: }
2524: tp->sackblks[0] = firstsack;
2525: return;
2526: }
2527:
2528: /*
2529: * Process the TCP SACK option. tp->snd_holes is an ordered list
2530: * of holes (oldest to newest, in terms of the sequence space).
2531: */
2532: void
2533: tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen)
2534: {
2535: int tmp_olen;
2536: u_char *tmp_cp;
2537: struct sackhole *cur, *p, *temp;
2538:
2539: if (!tp->sack_enable)
2540: return;
2541: /* SACK without ACK doesn't make sense. */
2542: if ((th->th_flags & TH_ACK) == 0)
2543: return;
2544: /* Make sure the ACK on this segment is in [snd_una, snd_max]. */
2545: if (SEQ_LT(th->th_ack, tp->snd_una) ||
2546: SEQ_GT(th->th_ack, tp->snd_max))
2547: return;
2548: /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
2549: if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
2550: return;
2551: /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
2552: tmp_cp = cp + 2;
2553: tmp_olen = optlen - 2;
2554: tcpstat.tcps_sack_rcv_opts++;
2555: if (tp->snd_numholes < 0)
2556: tp->snd_numholes = 0;
2557: if (tp->t_maxseg == 0)
2558: panic("tcp_sack_option"); /* Should never happen */
2559: while (tmp_olen > 0) {
2560: struct sackblk sack;
2561:
2562: bcopy(tmp_cp, (char *) &(sack.start), sizeof(tcp_seq));
2563: NTOHL(sack.start);
2564: bcopy(tmp_cp + sizeof(tcp_seq),
2565: (char *) &(sack.end), sizeof(tcp_seq));
2566: NTOHL(sack.end);
2567: tmp_olen -= TCPOLEN_SACK;
2568: tmp_cp += TCPOLEN_SACK;
2569: if (SEQ_LEQ(sack.end, sack.start))
2570: continue; /* bad SACK fields */
2571: if (SEQ_LEQ(sack.end, tp->snd_una))
2572: continue; /* old block */
2573: #if defined(TCP_SACK) && defined(TCP_FACK)
2574: /* Updates snd_fack. */
2575: if (SEQ_GT(sack.end, tp->snd_fack))
2576: tp->snd_fack = sack.end;
2577: #endif /* TCP_FACK */
2578: if (SEQ_GT(th->th_ack, tp->snd_una)) {
2579: if (SEQ_LT(sack.start, th->th_ack))
2580: continue;
2581: }
2582: if (SEQ_GT(sack.end, tp->snd_max))
2583: continue;
2584: if (tp->snd_holes == NULL) { /* first hole */
2585: tp->snd_holes = (struct sackhole *)
2586: pool_get(&sackhl_pool, PR_NOWAIT);
2587: if (tp->snd_holes == NULL) {
2588: /* ENOBUFS, so ignore SACKed block for now*/
2589: goto done;
2590: }
2591: cur = tp->snd_holes;
2592: cur->start = th->th_ack;
2593: cur->end = sack.start;
2594: cur->rxmit = cur->start;
2595: cur->next = NULL;
2596: tp->snd_numholes = 1;
2597: tp->rcv_lastsack = sack.end;
2598: /*
2599: * dups is at least one. If more data has been
2600: * SACKed, it can be greater than one.
2601: */
2602: cur->dups = min(tcprexmtthresh,
2603: ((sack.end - cur->end)/tp->t_maxseg));
2604: if (cur->dups < 1)
2605: cur->dups = 1;
2606: continue; /* with next sack block */
2607: }
2608: /* Go thru list of holes: p = previous, cur = current */
2609: p = cur = tp->snd_holes;
2610: while (cur) {
2611: if (SEQ_LEQ(sack.end, cur->start))
2612: /* SACKs data before the current hole */
2613: break; /* no use going through more holes */
2614: if (SEQ_GEQ(sack.start, cur->end)) {
2615: /* SACKs data beyond the current hole */
2616: cur->dups++;
2617: if (((sack.end - cur->end)/tp->t_maxseg) >=
2618: tcprexmtthresh)
2619: cur->dups = tcprexmtthresh;
2620: p = cur;
2621: cur = cur->next;
2622: continue;
2623: }
2624: if (SEQ_LEQ(sack.start, cur->start)) {
2625: /* Data acks at least the beginning of hole */
2626: #if defined(TCP_SACK) && defined(TCP_FACK)
2627: if (SEQ_GT(sack.end, cur->rxmit))
2628: tp->retran_data -=
2629: tcp_seq_subtract(cur->rxmit,
2630: cur->start);
2631: else
2632: tp->retran_data -=
2633: tcp_seq_subtract(sack.end,
2634: cur->start);
2635: #endif /* TCP_FACK */
2636: if (SEQ_GEQ(sack.end, cur->end)) {
2637: /* Acks entire hole, so delete hole */
2638: if (p != cur) {
2639: p->next = cur->next;
2640: pool_put(&sackhl_pool, cur);
2641: cur = p->next;
2642: } else {
2643: cur = cur->next;
2644: pool_put(&sackhl_pool, p);
2645: p = cur;
2646: tp->snd_holes = p;
2647: }
2648: tp->snd_numholes--;
2649: continue;
2650: }
2651: /* otherwise, move start of hole forward */
2652: cur->start = sack.end;
2653: cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
2654: p = cur;
2655: cur = cur->next;
2656: continue;
2657: }
2658: /* move end of hole backward */
2659: if (SEQ_GEQ(sack.end, cur->end)) {
2660: #if defined(TCP_SACK) && defined(TCP_FACK)
2661: if (SEQ_GT(cur->rxmit, sack.start))
2662: tp->retran_data -=
2663: tcp_seq_subtract(cur->rxmit,
2664: sack.start);
2665: #endif /* TCP_FACK */
2666: cur->end = sack.start;
2667: cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
2668: cur->dups++;
2669: if (((sack.end - cur->end)/tp->t_maxseg) >=
2670: tcprexmtthresh)
2671: cur->dups = tcprexmtthresh;
2672: p = cur;
2673: cur = cur->next;
2674: continue;
2675: }
2676: if (SEQ_LT(cur->start, sack.start) &&
2677: SEQ_GT(cur->end, sack.end)) {
2678: /*
2679: * ACKs some data in middle of a hole; need to
2680: * split current hole
2681: */
2682: temp = (struct sackhole *)
2683: pool_get(&sackhl_pool, PR_NOWAIT);
2684: if (temp == NULL)
2685: goto done; /* ENOBUFS */
2686: #if defined(TCP_SACK) && defined(TCP_FACK)
2687: if (SEQ_GT(cur->rxmit, sack.end))
2688: tp->retran_data -=
2689: tcp_seq_subtract(sack.end,
2690: sack.start);
2691: else if (SEQ_GT(cur->rxmit, sack.start))
2692: tp->retran_data -=
2693: tcp_seq_subtract(cur->rxmit,
2694: sack.start);
2695: #endif /* TCP_FACK */
2696: temp->next = cur->next;
2697: temp->start = sack.end;
2698: temp->end = cur->end;
2699: temp->dups = cur->dups;
2700: temp->rxmit = SEQ_MAX(cur->rxmit, temp->start);
2701: cur->end = sack.start;
2702: cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
2703: cur->dups++;
2704: if (((sack.end - cur->end)/tp->t_maxseg) >=
2705: tcprexmtthresh)
2706: cur->dups = tcprexmtthresh;
2707: cur->next = temp;
2708: p = temp;
2709: cur = p->next;
2710: tp->snd_numholes++;
2711: }
2712: }
2713: /* At this point, p points to the last hole on the list */
2714: if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
2715: /*
2716: * Need to append new hole at end.
2717: * Last hole is p (and it's not NULL).
2718: */
2719: temp = (struct sackhole *)
2720: pool_get(&sackhl_pool, PR_NOWAIT);
2721: if (temp == NULL)
2722: goto done; /* ENOBUFS */
2723: temp->start = tp->rcv_lastsack;
2724: temp->end = sack.start;
2725: temp->dups = min(tcprexmtthresh,
2726: ((sack.end - sack.start)/tp->t_maxseg));
2727: if (temp->dups < 1)
2728: temp->dups = 1;
2729: temp->rxmit = temp->start;
2730: temp->next = 0;
2731: p->next = temp;
2732: tp->rcv_lastsack = sack.end;
2733: tp->snd_numholes++;
2734: }
2735: }
2736: done:
2737: #if defined(TCP_SACK) && defined(TCP_FACK)
2738: /*
2739: * Update retran_data and snd_awnd. Go through the list of
2740: * holes. Increment retran_data by (hole->rxmit - hole->start).
2741: */
2742: tp->retran_data = 0;
2743: cur = tp->snd_holes;
2744: while (cur) {
2745: tp->retran_data += cur->rxmit - cur->start;
2746: cur = cur->next;
2747: }
2748: tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) +
2749: tp->retran_data;
2750: #endif /* TCP_FACK */
2751:
2752: return;
2753: }
2754:
2755: /*
2756: * Delete stale (i.e, cumulatively ack'd) holes. Hole is deleted only if
2757: * it is completely acked; otherwise, tcp_sack_option(), called from
2758: * tcp_dooptions(), will fix up the hole.
2759: */
2760: void
2761: tcp_del_sackholes(tp, th)
2762: struct tcpcb *tp;
2763: struct tcphdr *th;
2764: {
2765: if (tp->sack_enable && tp->t_state != TCPS_LISTEN) {
2766: /* max because this could be an older ack just arrived */
2767: tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
2768: th->th_ack : tp->snd_una;
2769: struct sackhole *cur = tp->snd_holes;
2770: struct sackhole *prev;
2771: while (cur)
2772: if (SEQ_LEQ(cur->end, lastack)) {
2773: prev = cur;
2774: cur = cur->next;
2775: pool_put(&sackhl_pool, prev);
2776: tp->snd_numholes--;
2777: } else if (SEQ_LT(cur->start, lastack)) {
2778: cur->start = lastack;
2779: if (SEQ_LT(cur->rxmit, cur->start))
2780: cur->rxmit = cur->start;
2781: break;
2782: } else
2783: break;
2784: tp->snd_holes = cur;
2785: }
2786: }
2787:
2788: /*
2789: * Delete all receiver-side SACK information.
2790: */
2791: void
2792: tcp_clean_sackreport(tp)
2793: struct tcpcb *tp;
2794: {
2795: int i;
2796:
2797: tp->rcv_numsacks = 0;
2798: for (i = 0; i < MAX_SACK_BLKS; i++)
2799: tp->sackblks[i].start = tp->sackblks[i].end=0;
2800:
2801: }
2802:
2803: /*
2804: * Checks for partial ack. If partial ack arrives, turn off retransmission
2805: * timer, deflate the window, do not clear tp->t_dupacks, and return 1.
2806: * If the ack advances at least to tp->snd_last, return 0.
2807: */
2808: int
2809: tcp_sack_partialack(tp, th)
2810: struct tcpcb *tp;
2811: struct tcphdr *th;
2812: {
2813: if (SEQ_LT(th->th_ack, tp->snd_last)) {
2814: /* Turn off retx. timer (will start again next segment) */
2815: TCP_TIMER_DISARM(tp, TCPT_REXMT);
2816: tp->t_rtttime = 0;
2817: #ifndef TCP_FACK
2818: /*
2819: * Partial window deflation. This statement relies on the
2820: * fact that tp->snd_una has not been updated yet. In FACK
2821: * hold snd_cwnd constant during fast recovery.
2822: */
2823: if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) {
2824: tp->snd_cwnd -= th->th_ack - tp->snd_una;
2825: tp->snd_cwnd += tp->t_maxseg;
2826: } else
2827: tp->snd_cwnd = tp->t_maxseg;
2828: #endif
2829: return (1);
2830: }
2831: return (0);
2832: }
2833: #endif /* TCP_SACK */
2834:
2835: /*
2836: * Pull out of band byte out of a segment so
2837: * it doesn't appear in the user's data queue.
2838: * It is still reflected in the segment length for
2839: * sequencing purposes.
2840: */
2841: void
2842: tcp_pulloutofband(so, urgent, m, off)
2843: struct socket *so;
2844: u_int urgent;
2845: struct mbuf *m;
2846: int off;
2847: {
2848: int cnt = off + urgent - 1;
2849:
2850: while (cnt >= 0) {
2851: if (m->m_len > cnt) {
2852: char *cp = mtod(m, caddr_t) + cnt;
2853: struct tcpcb *tp = sototcpcb(so);
2854:
2855: tp->t_iobc = *cp;
2856: tp->t_oobflags |= TCPOOB_HAVEDATA;
2857: bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
2858: m->m_len--;
2859: return;
2860: }
2861: cnt -= m->m_len;
2862: m = m->m_next;
2863: if (m == 0)
2864: break;
2865: }
2866: panic("tcp_pulloutofband");
2867: }
2868:
2869: /*
2870: * Collect new round-trip time estimate
2871: * and update averages and current timeout.
2872: */
2873: void
2874: tcp_xmit_timer(tp, rtt)
2875: struct tcpcb *tp;
2876: short rtt;
2877: {
2878: short delta;
2879: short rttmin;
2880:
2881: if (rtt < 0)
2882: rtt = 0;
2883: else if (rtt > TCP_RTT_MAX)
2884: rtt = TCP_RTT_MAX;
2885:
2886: tcpstat.tcps_rttupdated++;
2887: if (tp->t_srtt != 0) {
2888: /*
2889: * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits
2890: * after the binary point (scaled by 4), whereas
2891: * srtt is stored as fixed point with 5 bits after the
2892: * binary point (i.e., scaled by 32). The following magic
2893: * is equivalent to the smoothing algorithm in rfc793 with
2894: * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
2895: * point).
2896: */
2897: delta = (rtt << TCP_RTT_BASE_SHIFT) -
2898: (tp->t_srtt >> TCP_RTT_SHIFT);
2899: if ((tp->t_srtt += delta) <= 0)
2900: tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT;
2901: /*
2902: * We accumulate a smoothed rtt variance (actually, a
2903: * smoothed mean difference), then set the retransmit
2904: * timer to smoothed rtt + 4 times the smoothed variance.
2905: * rttvar is stored as fixed point with 4 bits after the
2906: * binary point (scaled by 16). The following is
2907: * equivalent to rfc793 smoothing with an alpha of .75
2908: * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
2909: * rfc793's wired-in beta.
2910: */
2911: if (delta < 0)
2912: delta = -delta;
2913: delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
2914: if ((tp->t_rttvar += delta) <= 0)
2915: tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT;
2916: } else {
2917: /*
2918: * No rtt measurement yet - use the unsmoothed rtt.
2919: * Set the variance to half the rtt (so our first
2920: * retransmit happens at 3*rtt).
2921: */
2922: tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT);
2923: tp->t_rttvar = (rtt + 1) <<
2924: (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1);
2925: }
2926: tp->t_rtttime = 0;
2927: tp->t_rxtshift = 0;
2928:
2929: /*
2930: * the retransmit should happen at rtt + 4 * rttvar.
2931: * Because of the way we do the smoothing, srtt and rttvar
2932: * will each average +1/2 tick of bias. When we compute
2933: * the retransmit timer, we want 1/2 tick of rounding and
2934: * 1 extra tick because of +-1/2 tick uncertainty in the
2935: * firing of the timer. The bias will give us exactly the
2936: * 1.5 tick we need. But, because the bias is
2937: * statistical, we have to test that we don't drop below
2938: * the minimum feasible timer (which is 2 ticks).
2939: */
2940: rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX);
2941: TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX);
2942:
2943: /*
2944: * We received an ack for a packet that wasn't retransmitted;
2945: * it is probably safe to discard any error indications we've
2946: * received recently. This isn't quite right, but close enough
2947: * for now (a route might have failed after we sent a segment,
2948: * and the return path might not be symmetrical).
2949: */
2950: tp->t_softerror = 0;
2951: }
2952:
2953: /*
2954: * Determine a reasonable value for maxseg size.
2955: * If the route is known, check route for mtu.
2956: * If none, use an mss that can be handled on the outgoing
2957: * interface without forcing IP to fragment; if bigger than
2958: * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
2959: * to utilize large mbufs. If no route is found, route has no mtu,
2960: * or the destination isn't local, use a default, hopefully conservative
2961: * size (usually 512 or the default IP max size, but no more than the mtu
2962: * of the interface), as we can't discover anything about intervening
2963: * gateways or networks. We also initialize the congestion/slow start
2964: * window to be a single segment if the destination isn't local.
2965: * While looking at the routing entry, we also initialize other path-dependent
2966: * parameters from pre-set or cached values in the routing entry.
2967: *
2968: * Also take into account the space needed for options that we
2969: * send regularly. Make maxseg shorter by that amount to assure
2970: * that we can send maxseg amount of data even when the options
2971: * are present. Store the upper limit of the length of options plus
2972: * data in maxopd.
2973: *
2974: * NOTE: offer == -1 indicates that the maxseg size changed due to
2975: * Path MTU discovery.
2976: */
2977: int
2978: tcp_mss(tp, offer)
2979: struct tcpcb *tp;
2980: int offer;
2981: {
2982: struct rtentry *rt;
2983: struct ifnet *ifp;
2984: int mss, mssopt;
2985: int iphlen;
2986: struct inpcb *inp;
2987:
2988: inp = tp->t_inpcb;
2989:
2990: mssopt = mss = tcp_mssdflt;
2991:
2992: rt = in_pcbrtentry(inp);
2993:
2994: if (rt == NULL)
2995: goto out;
2996:
2997: ifp = rt->rt_ifp;
2998:
2999: switch (tp->pf) {
3000: #ifdef INET6
3001: case AF_INET6:
3002: iphlen = sizeof(struct ip6_hdr);
3003: break;
3004: #endif
3005: case AF_INET:
3006: iphlen = sizeof(struct ip);
3007: break;
3008: default:
3009: /* the family does not support path MTU discovery */
3010: goto out;
3011: }
3012:
3013: #ifdef RTV_MTU
3014: /*
3015: * if there's an mtu associated with the route and we support
3016: * path MTU discovery for the underlying protocol family, use it.
3017: */
3018: if (rt->rt_rmx.rmx_mtu) {
3019: /*
3020: * One may wish to lower MSS to take into account options,
3021: * especially security-related options.
3022: */
3023: if (tp->pf == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
3024: /*
3025: * RFC2460 section 5, last paragraph: if path MTU is
3026: * smaller than 1280, use 1280 as packet size and
3027: * attach fragment header.
3028: */
3029: mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) -
3030: sizeof(struct tcphdr);
3031: } else
3032: mss = rt->rt_rmx.rmx_mtu - iphlen - sizeof(struct tcphdr);
3033: } else
3034: #endif /* RTV_MTU */
3035: if (!ifp)
3036: /*
3037: * ifp may be null and rmx_mtu may be zero in certain
3038: * v6 cases (e.g., if ND wasn't able to resolve the
3039: * destination host.
3040: */
3041: goto out;
3042: else if (ifp->if_flags & IFF_LOOPBACK)
3043: mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
3044: else if (tp->pf == AF_INET) {
3045: if (ip_mtudisc)
3046: mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
3047: else if (inp && in_localaddr(inp->inp_faddr))
3048: mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
3049: }
3050: #ifdef INET6
3051: else if (tp->pf == AF_INET6) {
3052: /*
3053: * for IPv6, path MTU discovery is always turned on,
3054: * or the node must use packet size <= 1280.
3055: */
3056: mss = IN6_LINKMTU(ifp) - iphlen - sizeof(struct tcphdr);
3057: }
3058: #endif /* INET6 */
3059:
3060: /* Calculate the value that we offer in TCPOPT_MAXSEG */
3061: if (offer != -1) {
3062: #ifndef INET6
3063: mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
3064: #else
3065: if (tp->pf == AF_INET6)
3066: mssopt = IN6_LINKMTU(ifp) - iphlen -
3067: sizeof(struct tcphdr);
3068: else
3069: mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
3070: #endif
3071:
3072: mssopt = max(tcp_mssdflt, mssopt);
3073: }
3074:
3075: out:
3076: /*
3077: * The current mss, t_maxseg, is initialized to the default value.
3078: * If we compute a smaller value, reduce the current mss.
3079: * If we compute a larger value, return it for use in sending
3080: * a max seg size option, but don't store it for use
3081: * unless we received an offer at least that large from peer.
3082: *
3083: * However, do not accept offers lower than the minimum of
3084: * the interface MTU and 216.
3085: */
3086: if (offer > 0)
3087: tp->t_peermss = offer;
3088: if (tp->t_peermss)
3089: mss = min(mss, max(tp->t_peermss, 216));
3090:
3091: /* sanity - at least max opt. space */
3092: mss = max(mss, 64);
3093:
3094: /*
3095: * maxopd stores the maximum length of data AND options
3096: * in a segment; maxseg is the amount of data in a normal
3097: * segment. We need to store this value (maxopd) apart
3098: * from maxseg, because now every segment carries options
3099: * and thus we normally have somewhat less data in segments.
3100: */
3101: tp->t_maxopd = mss;
3102:
3103: if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
3104: (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
3105: mss -= TCPOLEN_TSTAMP_APPA;
3106: #ifdef TCP_SIGNATURE
3107: if (tp->t_flags & TF_SIGNATURE)
3108: mss -= TCPOLEN_SIGLEN;
3109: #endif
3110:
3111: if (offer == -1) {
3112: /* mss changed due to Path MTU discovery */
3113: tp->t_flags &= ~TF_PMTUD_PEND;
3114: tp->t_pmtud_mtu_sent = 0;
3115: tp->t_pmtud_mss_acked = 0;
3116: if (mss < tp->t_maxseg) {
3117: /*
3118: * Follow suggestion in RFC 2414 to reduce the
3119: * congestion window by the ratio of the old
3120: * segment size to the new segment size.
3121: */
3122: tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) *
3123: mss, mss);
3124: }
3125: } else if (tcp_do_rfc3390) {
3126: /* increase initial window */
3127: tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380));
3128: } else
3129: tp->snd_cwnd = mss;
3130:
3131: tp->t_maxseg = mss;
3132:
3133: return (offer != -1 ? mssopt : mss);
3134: }
3135:
3136: u_int
3137: tcp_hdrsz(struct tcpcb *tp)
3138: {
3139: u_int hlen;
3140:
3141: switch (tp->pf) {
3142: #ifdef INET6
3143: case AF_INET6:
3144: hlen = sizeof(struct ip6_hdr);
3145: break;
3146: #endif
3147: case AF_INET:
3148: hlen = sizeof(struct ip);
3149: break;
3150: default:
3151: hlen = 0;
3152: break;
3153: }
3154: hlen += sizeof(struct tcphdr);
3155:
3156: if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
3157: (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
3158: hlen += TCPOLEN_TSTAMP_APPA;
3159: #ifdef TCP_SIGNATURE
3160: if (tp->t_flags & TF_SIGNATURE)
3161: hlen += TCPOLEN_SIGLEN;
3162: #endif
3163: return (hlen);
3164: }
3165:
3166: /*
3167: * Set connection variables based on the effective MSS.
3168: * We are passed the TCPCB for the actual connection. If we
3169: * are the server, we are called by the compressed state engine
3170: * when the 3-way handshake is complete. If we are the client,
3171: * we are called when we receive the SYN,ACK from the server.
3172: *
3173: * NOTE: The t_maxseg value must be initialized in the TCPCB
3174: * before this routine is called!
3175: */
void
tcp_mss_update(tp)
	struct tcpcb *tp;
{
	int mss;
	u_long bufsize;
	struct rtentry *rt;
	struct socket *so;

	so = tp->t_inpcb->inp_socket;
	/* Start from the effective MSS established earlier (see note above). */
	mss = tp->t_maxseg;

	rt = in_pcbrtentry(tp->t_inpcb);

	/* Without a route there is nothing sensible to size buffers against. */
	if (rt == NULL)
		return;

	/*
	 * If the send buffer is smaller than the MSS, clamp the MSS down to
	 * the buffer size; otherwise round the buffer up to a multiple of
	 * the MSS (capped at sb_max) so full segments fit exactly.
	 */
	bufsize = so->so_snd.sb_hiwat;
	if (bufsize < mss) {
		mss = bufsize;
		/* Update t_maxseg and t_maxopd */
		tcp_mss(tp, mss);
	} else {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		(void)sbreserve(&so->so_snd, bufsize);
	}

	/* Same rounding for the receive buffer, but never shrink the MSS. */
	bufsize = so->so_rcv.sb_hiwat;
	if (bufsize > mss) {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		(void)sbreserve(&so->so_rcv, bufsize);
	}

}
3214:
3215: #if defined (TCP_SACK)
3216: /*
3217: * Checks for partial ack. If partial ack arrives, force the retransmission
3218: * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
3219: * 1. By setting snd_nxt to ti_ack, this forces retransmission timer to
3220: * be started again. If the ack advances at least to tp->snd_last, return 0.
3221: */
int
tcp_newreno(tp, th)
	struct tcpcb *tp;
	struct tcphdr *th;
{
	/* Partial ack: the ACK does not reach snd_last (end of loss window). */
	if (SEQ_LT(th->th_ack, tp->snd_last)) {
		/*
		 * snd_una has not been updated and the socket send buffer
		 * not yet drained of the acked data, so we have to leave
		 * snd_una as it was to get the correct data offset in
		 * tcp_output().
		 */
		tcp_seq onxt = tp->snd_nxt;
		u_long  ocwnd = tp->snd_cwnd;
		/* Stop REXMT; tcp_output() below will restart it. */
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->t_rtttime = 0;
		/* Force retransmission of the first unacknowledged segment. */
		tp->snd_nxt = th->th_ack;
		/*
		 * Set snd_cwnd to one segment beyond acknowledged offset
		 * (tp->snd_una not yet updated when this function is called)
		 */
		tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
		(void) tcp_output(tp);
		/* Restore the saved window and send pointer afterwards. */
		tp->snd_cwnd = ocwnd;
		if (SEQ_GT(onxt, tp->snd_nxt))
			tp->snd_nxt = onxt;
		/*
		 * Partial window deflation.  Relies on fact that tp->snd_una
		 * not updated yet.
		 */
		if (tp->snd_cwnd > th->th_ack - tp->snd_una)
			tp->snd_cwnd -= th->th_ack - tp->snd_una;
		else
			tp->snd_cwnd = 0;
		tp->snd_cwnd += tp->t_maxseg;

		return 1;
	}
	/* ACK reached snd_last: fast recovery is complete. */
	return 0;
}
3262: #endif /* TCP_SACK */
3263:
3264: int
3265: tcp_mss_adv(struct ifnet *ifp, int af)
3266: {
3267: int mss = 0;
3268: int iphlen;
3269:
3270: switch (af) {
3271: case AF_INET:
3272: if (ifp != NULL)
3273: mss = ifp->if_mtu;
3274: iphlen = sizeof(struct ip);
3275: break;
3276: #ifdef INET6
3277: case AF_INET6:
3278: if (ifp != NULL)
3279: mss = IN6_LINKMTU(ifp);
3280: iphlen = sizeof(struct ip6_hdr);
3281: break;
3282: #endif
3283: }
3284: mss = mss - iphlen - sizeof(struct tcphdr);
3285: return (max(mss, tcp_mssdflt));
3286: }
3287:
3288: /*
3289: * TCP compressed state engine. Currently used to hold compressed
3290: * state for SYN_RECEIVED.
3291: */
3292:
/* Number of entries currently in the syn cache (all buckets). */
u_long syn_cache_count;
/* Random secrets mixed into the hash; refreshed when the cache empties. */
u_int32_t syn_hash1, syn_hash2;

/* Hash an IPv4 source address and the two ports with the secrets. */
#define SYN_HASH(sa, sp, dp) \
	((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
	((u_int32_t)(sp)))^syn_hash2)))
#ifndef INET6
#define	SYN_HASHALL(hash, src, dst) \
do {									\
	hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr,	\
		((struct sockaddr_in *)(src))->sin_port,		\
		((struct sockaddr_in *)(dst))->sin_port);		\
} while (/*CONSTCOND*/ 0)
#else
/* IPv6 variant: folds first and last address words in with the secrets. */
#define SYN_HASH6(sa, sp, dp) \
	((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
	  (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
	 & 0x7fffffff)

/* Dispatch on the sockaddr family; unknown families hash to 0. */
#define SYN_HASHALL(hash, src, dst) \
do {									\
	switch ((src)->sa_family) {					\
	case AF_INET:							\
		hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \
			((struct sockaddr_in *)(src))->sin_port,	\
			((struct sockaddr_in *)(dst))->sin_port);	\
		break;							\
	case AF_INET6:							\
		hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \
			((struct sockaddr_in6 *)(src))->sin6_port,	\
			((struct sockaddr_in6 *)(dst))->sin6_port);	\
		break;							\
	default:							\
		hash = 0;						\
	}								\
} while (/*CONSTCOND*/0)
#endif /* INET6 */

/*
 * Unlink an entry from its bucket and its tcpcb list, stop its timer,
 * and mark it dead.  Does NOT free it; pair with SYN_CACHE_PUT().
 */
#define	SYN_CACHE_RM(sc)						\
do {									\
	(sc)->sc_flags |= SCF_DEAD;					\
	TAILQ_REMOVE(&tcp_syn_cache[(sc)->sc_bucketidx].sch_bucket,	\
	    (sc), sc_bucketq);						\
	(sc)->sc_tp = NULL;						\
	LIST_REMOVE((sc), sc_tpq);					\
	tcp_syn_cache[(sc)->sc_bucketidx].sch_length--;			\
	timeout_del(&(sc)->sc_timer);					\
	syn_cache_count--;						\
} while (/*CONSTCOND*/0)

/*
 * Release an entry's resources and schedule syn_cache_reaper() to return
 * it to the pool (deferred so it is safe from timeout context).
 */
#define	SYN_CACHE_PUT(sc)						\
do {									\
	if ((sc)->sc_ipopts)						\
		(void) m_free((sc)->sc_ipopts);				\
	if ((sc)->sc_route4.ro_rt != NULL)				\
		RTFREE((sc)->sc_route4.ro_rt);				\
	timeout_set(&(sc)->sc_timer, syn_cache_reaper, (sc));		\
	timeout_add(&(sc)->sc_timer, 0);				\
} while (/*CONSTCOND*/0)

struct pool syn_cache_pool;

/*
 * We don't estimate RTT with SYNs, so each packet starts with the default
 * RTT and each timer step has a fixed timeout value.
 */
#define	SYN_CACHE_TIMER_ARM(sc)						\
do {									\
	TCPT_RANGESET((sc)->sc_rxtcur,					\
	    TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN,	\
	    TCPTV_REXMTMAX);						\
	if (!timeout_initialized(&(sc)->sc_timer))			\
		timeout_set(&(sc)->sc_timer, syn_cache_timer, (sc));	\
	timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \
} while (/*CONSTCOND*/0)

/* Timestamp value we send: global tcp_now offset by a per-entry modulation. */
#define	SYN_CACHE_TIMESTAMP(sc)	tcp_now + (sc)->sc_modulate
3370:
3371: void
3372: syn_cache_init()
3373: {
3374: int i;
3375:
3376: /* Initialize the hash buckets. */
3377: for (i = 0; i < tcp_syn_cache_size; i++)
3378: TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
3379:
3380: /* Initialize the syn cache pool. */
3381: pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
3382: "synpl", NULL);
3383: }
3384:
/*
 * Insert a freshly-built syn cache entry into the hash table and link it
 * to the listening tcpcb.  Evicts an old entry first if the target bucket
 * or the whole cache is at its limit, then arms the retransmit timer.
 */
void
syn_cache_insert(sc, tp)
	struct syn_cache *sc;
	struct tcpcb *tp;
{
	struct syn_cache_head *scp;
	struct syn_cache *sc2;
	int s;

	/*
	 * If there are no entries in the hash table, reinitialize
	 * the hash secrets.
	 */
	if (syn_cache_count == 0) {
		syn_hash1 = arc4random();
		syn_hash2 = arc4random();
	}

	SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
	sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
	scp = &tcp_syn_cache[sc->sc_bucketidx];

	/*
	 * Make sure that we don't overflow the per-bucket
	 * limit or the total cache size limit.
	 */
	s = splsoftnet();
	if (scp->sch_length >= tcp_syn_bucket_limit) {
		tcpstat.tcps_sc_bucketoverflow++;
		/*
		 * The bucket is full.  Toss the oldest element in the
		 * bucket.  This will be the first entry in the bucket.
		 */
		sc2 = TAILQ_FIRST(&scp->sch_bucket);
#ifdef DIAGNOSTIC
		/*
		 * This should never happen; we should always find an
		 * entry in our bucket.
		 */
		if (sc2 == NULL)
			panic("syn_cache_insert: bucketoverflow: impossible");
#endif
		SYN_CACHE_RM(sc2);
		SYN_CACHE_PUT(sc2);
	} else if (syn_cache_count >= tcp_syn_cache_limit) {
		struct syn_cache_head *scp2, *sce;

		tcpstat.tcps_sc_overflowed++;
		/*
		 * The cache is full.  Toss the oldest entry in the
		 * first non-empty bucket we can find.
		 *
		 * XXX We would really like to toss the oldest
		 * entry in the cache, but we hope that this
		 * condition doesn't happen very often.
		 */
		scp2 = scp;
		if (TAILQ_EMPTY(&scp2->sch_bucket)) {
			/* Circular scan of all buckets starting past ours. */
			sce = &tcp_syn_cache[tcp_syn_cache_size];
			for (++scp2; scp2 != scp; scp2++) {
				if (scp2 >= sce)
					scp2 = &tcp_syn_cache[0];
				if (! TAILQ_EMPTY(&scp2->sch_bucket))
					break;
			}
#ifdef DIAGNOSTIC
			/*
			 * This should never happen; we should always find a
			 * non-empty bucket.
			 */
			if (scp2 == scp)
				panic("syn_cache_insert: cacheoverflow: "
				    "impossible");
#endif
		}
		sc2 = TAILQ_FIRST(&scp2->sch_bucket);
		SYN_CACHE_RM(sc2);
		SYN_CACHE_PUT(sc2);
	}

	/*
	 * Initialize the entry's timer.
	 */
	sc->sc_rxttot = 0;
	sc->sc_rxtshift = 0;
	SYN_CACHE_TIMER_ARM(sc);

	/* Link it from tcpcb entry */
	LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);

	/* Put it into the bucket. */
	TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
	scp->sch_length++;
	syn_cache_count++;

	tcpstat.tcps_sc_added++;
	splx(s);
}
3483:
3484: /*
3485: * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
3486: * If we have retransmitted an entry the maximum number of times, expire
3487: * that entry.
3488: */
void
syn_cache_timer(void *arg)
{
	struct syn_cache *sc = arg;
	int s;

	s = splsoftnet();
	/* Entry was removed while our timeout was pending; nothing to do. */
	if (sc->sc_flags & SCF_DEAD) {
		splx(s);
		return;
	}

	if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
		/* Drop it -- too many retransmissions. */
		goto dropit;
	}

	/*
	 * Compute the total amount of time this entry has
	 * been on a queue.  If this entry has been on longer
	 * than the keep alive timer would allow, expire it.
	 */
	sc->sc_rxttot += sc->sc_rxtcur;
	if (sc->sc_rxttot >= tcptv_keep_init)
		goto dropit;

	/* Retransmit the SYN,ACK and back off the timer. */
	tcpstat.tcps_sc_retransmitted++;
	(void) syn_cache_respond(sc, NULL);

	/* Advance the timer back-off. */
	sc->sc_rxtshift++;
	SYN_CACHE_TIMER_ARM(sc);

	splx(s);
	return;

 dropit:
	tcpstat.tcps_sc_timed_out++;
	SYN_CACHE_RM(sc);
	SYN_CACHE_PUT(sc);
	splx(s);
}
3531:
3532: void
3533: syn_cache_reaper(void *arg)
3534: {
3535: struct syn_cache *sc = arg;
3536: int s;
3537:
3538: s = splsoftnet();
3539: pool_put(&syn_cache_pool, (sc));
3540: splx(s);
3541: return;
3542: }
3543:
3544: /*
3545: * Remove syn cache created by the specified tcb entry,
3546: * because this does not make sense to keep them
3547: * (if there's no tcb entry, syn cache entry will never be used)
3548: */
void
syn_cache_cleanup(tp)
	struct tcpcb *tp;
{
	struct syn_cache *sc, *nsc;
	int s;

	s = splsoftnet();

	/* Fetch the next pointer first: SYN_CACHE_RM unlinks sc from t_sc. */
	for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
		nsc = LIST_NEXT(sc, sc_tpq);

#ifdef DIAGNOSTIC
		if (sc->sc_tp != tp)
			panic("invalid sc_tp in syn_cache_cleanup");
#endif
		SYN_CACHE_RM(sc);
		SYN_CACHE_PUT(sc);
	}
	/* just for safety */
	LIST_INIT(&tp->t_sc);

	splx(s);
}
3573:
3574: /*
3575: * Find an entry in the syn cache.
3576: */
3577: struct syn_cache *
3578: syn_cache_lookup(src, dst, headp)
3579: struct sockaddr *src;
3580: struct sockaddr *dst;
3581: struct syn_cache_head **headp;
3582: {
3583: struct syn_cache *sc;
3584: struct syn_cache_head *scp;
3585: u_int32_t hash;
3586: int s;
3587:
3588: SYN_HASHALL(hash, src, dst);
3589:
3590: scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
3591: *headp = scp;
3592: s = splsoftnet();
3593: for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
3594: sc = TAILQ_NEXT(sc, sc_bucketq)) {
3595: if (sc->sc_hash != hash)
3596: continue;
3597: if (!bcmp(&sc->sc_src, src, src->sa_len) &&
3598: !bcmp(&sc->sc_dst, dst, dst->sa_len)) {
3599: splx(s);
3600: return (sc);
3601: }
3602: }
3603: splx(s);
3604: return (NULL);
3605: }
3606:
3607: /*
3608: * This function gets called when we receive an ACK for a
3609: * socket in the LISTEN state. We look up the connection
3610: * in the syn cache, and if its there, we pull it out of
3611: * the cache and turn it into a full-blown connection in
3612: * the SYN-RECEIVED state.
3613: *
3614: * The return values may not be immediately obvious, and their effects
3615: * can be subtle, so here they are:
3616: *
3617: * NULL SYN was not found in cache; caller should drop the
3618: * packet and send an RST.
3619: *
3620: * -1 We were unable to create the new connection, and are
3621: * aborting it. An ACK,RST is being sent to the peer
3622: * (unless we got screwey sequence numbners; see below),
3623: * because the 3-way handshake has been completed. Caller
3624: * should not free the mbuf, since we may be using it. If
3625: * we are not, we will free it.
3626: *
3627: * Otherwise, the return value is a pointer to the new socket
3628: * associated with the connection.
3629: */
struct socket *
syn_cache_get(src, dst, th, hlen, tlen, so, m)
	struct sockaddr *src;
	struct sockaddr *dst;
	struct tcphdr *th;
	unsigned int hlen, tlen;
	struct socket *so;
	struct mbuf *m;
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	struct inpcb *inp = NULL;
	struct tcpcb *tp = 0;
	struct mbuf *am;
	int s;
	struct socket *oso;

	s = splsoftnet();
	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
		splx(s);
		return (NULL);
	}

	/*
	 * Verify the sequence and ack numbers.  Try getting the correct
	 * response again.
	 */
	if ((th->th_ack != sc->sc_iss + 1) ||
	    SEQ_LEQ(th->th_seq, sc->sc_irs) ||
	    SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
		(void) syn_cache_respond(sc, m);
		splx(s);
		return ((struct socket *)(-1));
	}

	/* Remove this cache entry */
	SYN_CACHE_RM(sc);
	splx(s);

	/*
	 * Ok, create the full blown connection, and set things up
	 * as they would have been set up if we had created the
	 * connection when the SYN arrived.  If we can't create
	 * the connection, abort it.
	 */
	oso = so;
	so = sonewconn(so, SS_ISCONNECTED);
	if (so == NULL)
		goto resetandabort;

	/* inp points at the LISTENING socket's pcb until reassigned below. */
	inp = sotoinpcb(oso);
#ifdef IPSEC
	/*
	 * We need to copy the required security levels
	 * from the old pcb. Ditto for any other
	 * IPsec-related information.
	 */
	{
	struct inpcb *newinp = (struct inpcb *)so->so_pcb;
	bcopy(inp->inp_seclevel, newinp->inp_seclevel,
	    sizeof(inp->inp_seclevel));
	newinp->inp_secrequire = inp->inp_secrequire;
	if (inp->inp_ipo != NULL) {
		newinp->inp_ipo = inp->inp_ipo;
		inp->inp_ipo->ipo_ref_count++;
	}
	if (inp->inp_ipsec_remotecred != NULL) {
		newinp->inp_ipsec_remotecred = inp->inp_ipsec_remotecred;
		inp->inp_ipsec_remotecred->ref_count++;
	}
	if (inp->inp_ipsec_remoteauth != NULL) {
		newinp->inp_ipsec_remoteauth
		    = inp->inp_ipsec_remoteauth;
		inp->inp_ipsec_remoteauth->ref_count++;
	}
	}
#endif /* IPSEC */
#ifdef INET6
	/*
	 * inp still has the OLD in_pcb stuff, set the
	 * v6-related flags on the new guy, too.
	 */
	{
	int flags = inp->inp_flags;
	struct inpcb *oldinpcb = inp;

	inp = (struct inpcb *)so->so_pcb;
	inp->inp_flags |= (flags & INP_IPV6);
	if ((inp->inp_flags & INP_IPV6) != 0) {
		inp->inp_ipv6.ip6_hlim =
		    oldinpcb->inp_ipv6.ip6_hlim;
	}
	}
#else /* INET6 */
	inp = (struct inpcb *)so->so_pcb;
#endif /* INET6 */

	/* Bind the new pcb to the local address/port the SYN was sent to. */
	inp->inp_lport = th->th_dport;
	switch (src->sa_family) {
#ifdef INET6
	case AF_INET6:
		inp->inp_laddr6 = ((struct sockaddr_in6 *)dst)->sin6_addr;
		break;
#endif /* INET6 */
	case AF_INET:

		inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr;
		/* Prefer options from this ACK; fall back to the cached ones. */
		inp->inp_options = ip_srcroute();
		if (inp->inp_options == NULL) {
			inp->inp_options = sc->sc_ipopts;
			sc->sc_ipopts = NULL;
		}
		break;
	}
	in_pcbrehash(inp);

	/*
	 * Give the new socket our cached route reference.
	 */
	if (src->sa_family == AF_INET)
		inp->inp_route = sc->sc_route4;		/* struct assignment */
#ifdef INET6
	else
		inp->inp_route6 = sc->sc_route6;
#endif
	sc->sc_route4.ro_rt = NULL;

	/* Build a sockaddr mbuf for the pcb-connect routines. */
	am = m_get(M_DONTWAIT, MT_SONAME);	/* XXX */
	if (am == NULL)
		goto resetandabort;
	am->m_len = src->sa_len;
	bcopy(src, mtod(am, caddr_t), src->sa_len);

	switch (src->sa_family) {
	case AF_INET:
		/* drop IPv4 packet to AF_INET6 socket */
		if (inp->inp_flags & INP_IPV6) {
			(void) m_free(am);
			goto resetandabort;
		}
		if (in_pcbconnect(inp, am)) {
			(void) m_free(am);
			goto resetandabort;
		}
		break;
#ifdef INET6
	case AF_INET6:
		if (in6_pcbconnect(inp, am)) {
			(void) m_free(am);
			goto resetandabort;
		}
		break;
#endif
	}
	(void) m_free(am);

	tp = intotcpcb(inp);
	/* Inherit only TF_NODELAY from the listening socket. */
	tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
	/* r_scale == 15 is the sentinel for "peer did not offer scaling". */
	if (sc->sc_request_r_scale != 15) {
		tp->requested_s_scale = sc->sc_requested_s_scale;
		tp->request_r_scale = sc->sc_request_r_scale;
		tp->snd_scale = sc->sc_requested_s_scale;
		tp->rcv_scale = sc->sc_request_r_scale;
		tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
	}
	if (sc->sc_flags & SCF_TIMESTAMP)
		tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;

	tp->t_template = tcp_template(tp);
	if (tp->t_template == 0) {
		tp = tcp_drop(tp, ENOBUFS);	/* destroys socket */
		so = NULL;
		m_freem(m);
		goto abort;
	}
#ifdef TCP_SACK
	tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT;
#endif

	tp->ts_modulate = sc->sc_modulate;
	tp->iss = sc->sc_iss;
	tp->irs = sc->sc_irs;
	tcp_sendseqinit(tp);
#if defined (TCP_SACK) || defined(TCP_ECN)
	tp->snd_last = tp->snd_una;
#endif /* TCP_SACK */
#if defined(TCP_SACK) && defined(TCP_FACK)
	tp->snd_fack = tp->snd_una;
	tp->retran_data = 0;
	tp->snd_awnd = 0;
#endif /* TCP_FACK */
#ifdef TCP_ECN
	if (sc->sc_flags & SCF_ECN_PERMIT) {
		tp->t_flags |= TF_ECN_PERMIT;
		tcpstat.tcps_ecn_accepts++;
	}
#endif
#ifdef TCP_SACK
	if (sc->sc_flags & SCF_SACK_PERMIT)
		tp->t_flags |= TF_SACK_PERMIT;
#endif
#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE)
		tp->t_flags |= TF_SIGNATURE;
#endif
	tcp_rcvseqinit(tp);
	tp->t_state = TCPS_SYN_RECEIVED;
	tp->t_rcvtime = tcp_now;
	TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
	tcpstat.tcps_accepts++;

	tcp_mss(tp, sc->sc_peermaxseg);	 /* sets t_maxseg */
	if (sc->sc_peermaxseg)
		tcp_mss_update(tp);
	/* Reset initial window to 1 segment for retransmit */
	if (sc->sc_rxtshift > 0)
		tp->snd_cwnd = tp->t_maxseg;
	tp->snd_wl1 = sc->sc_irs;
	tp->rcv_up = sc->sc_irs + 1;

	/*
	 * This is what whould have happened in tcp_output() when
	 * the SYN,ACK was sent.
	 */
	tp->snd_up = tp->snd_una;
	tp->snd_max = tp->snd_nxt = tp->iss+1;
	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
	if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
		tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
	tp->last_ack_sent = tp->rcv_nxt;

	tcpstat.tcps_sc_completed++;
	SYN_CACHE_PUT(sc);
	return (so);

resetandabort:
	/* tcp_respond() consumes m, so the caller must not free it. */
	tcp_respond(NULL, mtod(m, caddr_t), m, (tcp_seq)0, th->th_ack, TH_RST);
abort:
	if (so != NULL)
		(void) soabort(so);
	SYN_CACHE_PUT(sc);
	tcpstat.tcps_sc_aborted++;
	return ((struct socket *)(-1));
}
3874:
3875: /*
3876: * This function is called when we get a RST for a
3877: * non-existent connection, so that we can see if the
3878: * connection is in the syn cache. If it is, zap it.
3879: */
3880:
void
syn_cache_reset(src, dst, th)
	struct sockaddr *src;
	struct sockaddr *dst;
	struct tcphdr *th;
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	int s = splsoftnet();

	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
		splx(s);
		return;
	}
	/* Only honor the RST if its sequence is within the expected window. */
	if (SEQ_LT(th->th_seq, sc->sc_irs) ||
	    SEQ_GT(th->th_seq, sc->sc_irs+1)) {
		splx(s);
		return;
	}
	SYN_CACHE_RM(sc);
	splx(s);
	tcpstat.tcps_sc_reset++;
	SYN_CACHE_PUT(sc);
}
3905:
/*
 * Handle an ICMP unreachable indication for a connection that is still
 * in the syn cache.  Bogus or early reports are ignored; persistent
 * unreachability after several retransmits expires the entry.
 */
void
syn_cache_unreach(src, dst, th)
	struct sockaddr *src;
	struct sockaddr *dst;
	struct tcphdr *th;
{
	struct syn_cache *sc;
	struct syn_cache_head *scp;
	int s;

	s = splsoftnet();
	if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
		splx(s);
		return;
	}
	/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
	if (ntohl (th->th_seq) != sc->sc_iss) {
		splx(s);
		return;
	}

	/*
	 * If we've retransmitted 3 times and this is our second error,
	 * we remove the entry.  Otherwise, we allow it to continue on.
	 * This prevents us from incorrectly nuking an entry during a
	 * spurious network outage.
	 *
	 * See tcp_notify().
	 */
	if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
		sc->sc_flags |= SCF_UNREACH;
		splx(s);
		return;
	}

	SYN_CACHE_RM(sc);
	splx(s);
	tcpstat.tcps_sc_unreach++;
	SYN_CACHE_PUT(sc);
}
3946:
3947: /*
3948: * Given a LISTEN socket and an inbound SYN request, add
3949: * this to the syn cache, and send back a segment:
3950: * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
3951: * to the source.
3952: *
3953: * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
3954: * Doing so would require that we hold onto the data and deliver it
3955: * to the application. However, if we are the target of a SYN-flood
3956: * DoS attack, an attacker could send data which would eventually
3957: * consume all available buffer space if it were ACKed. By not ACKing
3958: * the data, we avoid this DoS scenario.
3959: */
3960:
3961: int
3962: syn_cache_add(src, dst, th, iphlen, so, m, optp, optlen, oi, issp)
3963: struct sockaddr *src;
3964: struct sockaddr *dst;
3965: struct tcphdr *th;
3966: unsigned int iphlen;
3967: struct socket *so;
3968: struct mbuf *m;
3969: u_char *optp;
3970: int optlen;
3971: struct tcp_opt_info *oi;
3972: tcp_seq *issp;
3973: {
3974: struct tcpcb tb, *tp;
3975: long win;
3976: struct syn_cache *sc;
3977: struct syn_cache_head *scp;
3978: struct mbuf *ipopts;
3979:
3980: tp = sototcpcb(so);
3981:
3982: /*
3983: * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
3984: *
3985: * Note this check is performed in tcp_input() very early on.
3986: */
3987:
3988: /*
3989: * Initialize some local state.
3990: */
3991: win = sbspace(&so->so_rcv);
3992: if (win > TCP_MAXWIN)
3993: win = TCP_MAXWIN;
3994:
3995: #ifdef TCP_SIGNATURE
3996: if (optp || (tp->t_flags & TF_SIGNATURE)) {
3997: #else
3998: if (optp) {
3999: #endif
4000: tb.pf = tp->pf;
4001: #ifdef TCP_SACK
4002: tb.sack_enable = tp->sack_enable;
4003: #endif
4004: tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
4005: #ifdef TCP_SIGNATURE
4006: if (tp->t_flags & TF_SIGNATURE)
4007: tb.t_flags |= TF_SIGNATURE;
4008: #endif
4009: tb.t_state = TCPS_LISTEN;
4010: if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi))
4011: return (0);
4012: } else
4013: tb.t_flags = 0;
4014:
4015: switch (src->sa_family) {
4016: #ifdef INET
4017: case AF_INET:
4018: /*
4019: * Remember the IP options, if any.
4020: */
4021: ipopts = ip_srcroute();
4022: break;
4023: #endif
4024: default:
4025: ipopts = NULL;
4026: }
4027:
4028: /*
4029: * See if we already have an entry for this connection.
4030: * If we do, resend the SYN,ACK. We do not count this
4031: * as a retransmission (XXX though maybe we should).
4032: */
4033: if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
4034: tcpstat.tcps_sc_dupesyn++;
4035: if (ipopts) {
4036: /*
4037: * If we were remembering a previous source route,
4038: * forget it and use the new one we've been given.
4039: */
4040: if (sc->sc_ipopts)
4041: (void) m_free(sc->sc_ipopts);
4042: sc->sc_ipopts = ipopts;
4043: }
4044: sc->sc_timestamp = tb.ts_recent;
4045: if (syn_cache_respond(sc, m) == 0) {
4046: tcpstat.tcps_sndacks++;
4047: tcpstat.tcps_sndtotal++;
4048: }
4049: return (1);
4050: }
4051:
4052: sc = pool_get(&syn_cache_pool, PR_NOWAIT);
4053: if (sc == NULL) {
4054: if (ipopts)
4055: (void) m_free(ipopts);
4056: return (0);
4057: }
4058:
4059: /*
4060: * Fill in the cache, and put the necessary IP and TCP
4061: * options into the reply.
4062: */
4063: bzero(sc, sizeof(struct syn_cache));
4064: bzero(&sc->sc_timer, sizeof(sc->sc_timer));
4065: bcopy(src, &sc->sc_src, src->sa_len);
4066: bcopy(dst, &sc->sc_dst, dst->sa_len);
4067: sc->sc_flags = 0;
4068: sc->sc_ipopts = ipopts;
4069: sc->sc_irs = th->th_seq;
4070:
4071: #ifdef TCP_COMPAT_42
4072: tcp_iss += TCP_ISSINCR/2;
4073: sc->sc_iss = tcp_iss;
4074: #else
4075: sc->sc_iss = issp ? *issp : arc4random();
4076: #endif
4077: sc->sc_peermaxseg = oi->maxseg;
4078: sc->sc_ourmaxseg = tcp_mss_adv(m->m_flags & M_PKTHDR ?
4079: m->m_pkthdr.rcvif : NULL, sc->sc_src.sa.sa_family);
4080: sc->sc_win = win;
4081: sc->sc_timestamp = tb.ts_recent;
4082: if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
4083: (TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
4084: sc->sc_flags |= SCF_TIMESTAMP;
4085: sc->sc_modulate = arc4random();
4086: }
4087: if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
4088: (TF_RCVD_SCALE|TF_REQ_SCALE)) {
4089: sc->sc_requested_s_scale = tb.requested_s_scale;
4090: sc->sc_request_r_scale = 0;
4091: while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
4092: TCP_MAXWIN << sc->sc_request_r_scale <
4093: so->so_rcv.sb_hiwat)
4094: sc->sc_request_r_scale++;
4095: } else {
4096: sc->sc_requested_s_scale = 15;
4097: sc->sc_request_r_scale = 15;
4098: }
4099: #ifdef TCP_ECN
4100: /*
4101: * if both ECE and CWR flag bits are set, peer is ECN capable.
4102: */
4103: if (tcp_do_ecn &&
4104: (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
4105: sc->sc_flags |= SCF_ECN_PERMIT;
4106: #endif
4107: #ifdef TCP_SACK
4108: /*
4109: * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
4110: * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
4111: */
4112: if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
4113: sc->sc_flags |= SCF_SACK_PERMIT;
4114: #endif
4115: #ifdef TCP_SIGNATURE
4116: if (tb.t_flags & TF_SIGNATURE)
4117: sc->sc_flags |= SCF_SIGNATURE;
4118: #endif
4119: sc->sc_tp = tp;
4120: if (syn_cache_respond(sc, m) == 0) {
4121: syn_cache_insert(sc, tp);
4122: tcpstat.tcps_sndacks++;
4123: tcpstat.tcps_sndtotal++;
4124: } else {
4125: SYN_CACHE_PUT(sc);
4126: tcpstat.tcps_sc_dropped++;
4127: }
4128: return (1);
4129: }
4130:
4131: int
4132: syn_cache_respond(sc, m)
4133: struct syn_cache *sc;
4134: struct mbuf *m;
4135: {
4136: struct route *ro;
4137: u_int8_t *optp;
4138: int optlen, error;
4139: u_int16_t tlen;
4140: struct ip *ip = NULL;
4141: #ifdef INET6
4142: struct ip6_hdr *ip6 = NULL;
4143: #endif
4144: struct tcphdr *th;
4145: u_int hlen;
4146: struct inpcb *inp;
4147:
4148: switch (sc->sc_src.sa.sa_family) {
4149: case AF_INET:
4150: hlen = sizeof(struct ip);
4151: ro = &sc->sc_route4;
4152: break;
4153: #ifdef INET6
4154: case AF_INET6:
4155: hlen = sizeof(struct ip6_hdr);
4156: ro = (struct route *)&sc->sc_route6;
4157: break;
4158: #endif
4159: default:
4160: if (m)
4161: m_freem(m);
4162: return (EAFNOSUPPORT);
4163: }
4164:
4165: /* Compute the size of the TCP options. */
4166: optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
4167: #ifdef TCP_SACK
4168: ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
4169: #endif
4170: #ifdef TCP_SIGNATURE
4171: ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
4172: #endif
4173: ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);
4174:
4175: tlen = hlen + sizeof(struct tcphdr) + optlen;
4176:
4177: /*
4178: * Create the IP+TCP header from scratch.
4179: */
4180: if (m)
4181: m_freem(m);
4182: #ifdef DIAGNOSTIC
4183: if (max_linkhdr + tlen > MCLBYTES)
4184: return (ENOBUFS);
4185: #endif
4186: MGETHDR(m, M_DONTWAIT, MT_DATA);
4187: if (m && max_linkhdr + tlen > MHLEN) {
4188: MCLGET(m, M_DONTWAIT);
4189: if ((m->m_flags & M_EXT) == 0) {
4190: m_freem(m);
4191: m = NULL;
4192: }
4193: }
4194: if (m == NULL)
4195: return (ENOBUFS);
4196:
4197: /* Fixup the mbuf. */
4198: m->m_data += max_linkhdr;
4199: m->m_len = m->m_pkthdr.len = tlen;
4200: m->m_pkthdr.rcvif = NULL;
4201: memset(mtod(m, u_char *), 0, tlen);
4202:
4203: switch (sc->sc_src.sa.sa_family) {
4204: case AF_INET:
4205: ip = mtod(m, struct ip *);
4206: ip->ip_dst = sc->sc_src.sin.sin_addr;
4207: ip->ip_src = sc->sc_dst.sin.sin_addr;
4208: ip->ip_p = IPPROTO_TCP;
4209: th = (struct tcphdr *)(ip + 1);
4210: th->th_dport = sc->sc_src.sin.sin_port;
4211: th->th_sport = sc->sc_dst.sin.sin_port;
4212: break;
4213: #ifdef INET6
4214: case AF_INET6:
4215: ip6 = mtod(m, struct ip6_hdr *);
4216: ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
4217: ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
4218: ip6->ip6_nxt = IPPROTO_TCP;
4219: /* ip6_plen will be updated in ip6_output() */
4220: th = (struct tcphdr *)(ip6 + 1);
4221: th->th_dport = sc->sc_src.sin6.sin6_port;
4222: th->th_sport = sc->sc_dst.sin6.sin6_port;
4223: break;
4224: #endif
4225: default:
4226: th = NULL;
4227: }
4228:
4229: th->th_seq = htonl(sc->sc_iss);
4230: th->th_ack = htonl(sc->sc_irs + 1);
	/*
	 * Finish building the SYN,ACK reply: TCP header fields, options,
	 * checksum, remaining IP-level fields, then transmit.
	 */
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = TH_SYN|TH_ACK;
#ifdef TCP_ECN
	/* Set ECE for SYN-ACK if peer supports ECN. */
	if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
		th->th_flags |= TH_ECE;
#endif
	th->th_win = htons(sc->sc_win);
	/* th_sum already 0 */
	/* th_urp already 0 */

	/*
	 * Tack on the TCP options.  The MSS option is always sent,
	 * advertising the segment size computed for this connection.
	 */
	optp = (u_int8_t *)(th + 1);
	*optp++ = TCPOPT_MAXSEG;
	*optp++ = 4;				/* kind + len + 2 MSS bytes */
	*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
	*optp++ = sc->sc_ourmaxseg & 0xff;

#ifdef TCP_SACK
	/* Include SACK_PERMIT_HDR option if peer has already done so. */
	if (sc->sc_flags & SCF_SACK_PERMIT) {
		*((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
		optp += 4;
	}
#endif

	/*
	 * Window scale option, NOP-padded to 4 bytes.  A request scale
	 * of 15 acts as the "no window scaling" sentinel here, so the
	 * option is suppressed in that case.  NOTE(review): 15-as-sentinel
	 * is inferred from this test -- confirm against syn_cache_add().
	 */
	if (sc->sc_request_r_scale != 15) {
		*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
		    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
		    sc->sc_request_r_scale);
		optp += 4;
	}

	if (sc->sc_flags & SCF_TIMESTAMP) {
		u_int32_t *lp = (u_int32_t *)(optp);
		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));	/* our TSval */
		*lp = htonl(sc->sc_timestamp);		/* echo peer's TSval */
		optp += TCPOLEN_TSTAMP_APPA;
	}

#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE) {
		union sockaddr_union src, dst;
		struct tdb *tdb;

		/*
		 * Build src/dst sockaddrs from the received packet's
		 * addresses so the security association (TDB) holding
		 * the shared TCP-MD5 key can be looked up.
		 */
		bzero(&src, sizeof(union sockaddr_union));
		bzero(&dst, sizeof(union sockaddr_union));
		src.sa.sa_len = sc->sc_src.sa.sa_len;
		src.sa.sa_family = sc->sc_src.sa.sa_family;
		dst.sa.sa_len = sc->sc_dst.sa.sa_len;
		dst.sa.sa_family = sc->sc_dst.sa.sa_family;

		switch (sc->sc_src.sa.sa_family) {
		case 0:	/*default to PF_INET*/
#ifdef INET
		case AF_INET:
			src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
			dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
			break;
#endif /* INET */
#ifdef INET6
		case AF_INET6:
			src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
			dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
			break;
#endif /* INET6 */
		}

		tdb = gettdbbysrcdst(0, &src, &dst, IPPROTO_TCP);
		if (tdb == NULL) {
			/* No SA for this peer: refuse to send unsigned. */
			if (m)
				m_freem(m);
			return (EPERM);
		}

		/* Send signature option */
		*(optp++) = TCPOPT_SIGNATURE;
		*(optp++) = TCPOLEN_SIGNATURE;

		/* The digest is written directly into the option data. */
		if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
		    hlen, 0, optp) < 0) {
			if (m)
				m_freem(m);
			return (EINVAL);
		}
		optp += 16;	/* skip past the 16-byte MD5 digest */

		/* Pad options list to the next 32 bit boundary and
		 * terminate it.
		 */
		*optp++ = TCPOPT_NOP;
		*optp++ = TCPOPT_EOL;
	}
#endif /* TCP_SIGNATURE */

	/*
	 * Compute the packet's checksum.  The length field is first set
	 * to the TCP segment length (tlen - hlen); NOTE(review): for the
	 * inet case this appears to make the IP header double as the TCP
	 * pseudo-header for in_cksum() -- ip_len is rewritten to the full
	 * packet length below.  Confirm against in_cksum() usage elsewhere.
	 */
	switch (sc->sc_src.sa.sa_family) {
	case AF_INET:
		ip->ip_len = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in_cksum(m, tlen);
		break;
#ifdef INET6
	case AF_INET6:
		ip6->ip6_plen = htons(tlen - hlen);
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		break;
#endif
	}

	/* use IPsec policy and ttl from listening socket, on SYN ACK */
	inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;

	/*
	 * Fill in some straggling IP bits.  NOTE(review): an earlier
	 * revision's comment claimed ip_len is expected in host order,
	 * but it is stored with htons() here -- that comment was stale.
	 */
	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		ip->ip_len = htons(tlen);
		/* TTL from the listening socket if present, else default. */
		ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
		/* XXX tos? */
		break;
#endif
#ifdef INET6
	case AF_INET6:
		ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim will be initialized afterwards */
		/* leave flowlabel = 0, it is legal and require no state mgmt */
		break;
#endif
	}

	/* Hand the segment to the appropriate output routine. */
	switch (sc->sc_src.sa.sa_family) {
#ifdef INET
	case AF_INET:
		error = ip_output(m, sc->sc_ipopts, ro,
		    (ip_mtudisc ? IP_MTUDISC : 0),
		    (struct ip_moptions *)NULL, inp);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		/* Choose a hop limit based on the outgoing interface. */
		ip6->ip6_hlim = in6_selecthlim(NULL,
		    ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);

		error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 0,
		    (struct ip6_moptions *)0, NULL, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}
	return (error);
}
CVSweb