From 9b1aefc41b4d93794f3f96c6c84d478de0c26114 Mon Sep 17 00:00:00 2001 From: neirac Date: Fri, 26 Apr 2024 00:01:23 +0000 Subject: [PATCH] 12455 SO_REUSEPORT Co-authored-by: araragihokuto@outlook.com Co-authored-by: cneirabustos@gmail.com --- usr/src/man/man3socket/getsockopt.3socket | 20 ++ usr/src/pkg/manifests/system-test-ostest.p5m | 9 + usr/src/test/os-tests/runfiles/default.run | 11 + usr/src/test/os-tests/tests/Makefile | 1 + .../test/os-tests/tests/reuseport/Makefile | 67 +++++ .../os-tests/tests/reuseport/rp_test_tcp.c | 105 +++++++ .../tests/reuseport/rp_test_tcp_noopt.c | 94 +++++++ .../tests/reuseport/rp_test_tcp_seteuid.c | 158 +++++++++++ .../tests/reuseport/rp_test_tcp_toggle.c | 106 ++++++++ .../os-tests/tests/reuseport/rp_test_udp.c | 105 +++++++ .../tests/reuseport/rp_test_udp_noopt.c | 94 +++++++ .../tests/reuseport/rp_test_udp_seteuid.c | 158 +++++++++++ .../tests/reuseport/rp_test_udp_toggle.c | 106 ++++++++ usr/src/uts/common/inet/ip/conn_opt.c | 24 ++ usr/src/uts/common/inet/ip/ip.c | 12 +- usr/src/uts/common/inet/ip/ip6.c | 2 +- usr/src/uts/common/inet/ip/ipclassifier.c | 256 ++++++++++++++++++ usr/src/uts/common/inet/ipclassifier.h | 25 ++ usr/src/uts/common/inet/tcp.h | 8 - usr/src/uts/common/inet/tcp/tcp.c | 23 +- usr/src/uts/common/inet/tcp/tcp_bind.c | 190 +++---------- usr/src/uts/common/inet/tcp/tcp_opt_data.c | 103 ------- usr/src/uts/common/inet/tcp/tcp_socket.c | 4 +- usr/src/uts/common/inet/tcp_impl.h | 23 +- usr/src/uts/common/inet/udp/udp.c | 88 +++++- usr/src/uts/common/inet/udp/udp_opt_data.c | 1 + usr/src/uts/common/sys/socket.h | 2 + 27 files changed, 1475 insertions(+), 320 deletions(-) create mode 100644 usr/src/test/os-tests/tests/reuseport/Makefile create mode 100644 usr/src/test/os-tests/tests/reuseport/rp_test_tcp.c create mode 100644 usr/src/test/os-tests/tests/reuseport/rp_test_tcp_noopt.c create mode 100644 usr/src/test/os-tests/tests/reuseport/rp_test_tcp_seteuid.c create mode 100644 
usr/src/test/os-tests/tests/reuseport/rp_test_tcp_toggle.c create mode 100644 usr/src/test/os-tests/tests/reuseport/rp_test_udp.c create mode 100644 usr/src/test/os-tests/tests/reuseport/rp_test_udp_noopt.c create mode 100644 usr/src/test/os-tests/tests/reuseport/rp_test_udp_seteuid.c create mode 100644 usr/src/test/os-tests/tests/reuseport/rp_test_udp_toggle.c diff --git a/usr/src/man/man3socket/getsockopt.3socket b/usr/src/man/man3socket/getsockopt.3socket index 7e2c55509f..941273e80b 100644 --- a/usr/src/man/man3socket/getsockopt.3socket +++ b/usr/src/man/man3socket/getsockopt.3socket @@ -105,6 +105,17 @@ enable/disable recording of debugging information enable/disable local address reuse .RE +.sp +.ne 2 +\fB\fBSO_REUSEPORT\fR\fR +.ad +.RS 19n +enable/disable multiple bindings to an identical socket address. +.RE +.sp +.ne 2 +.na + .sp .ne 2 .na @@ -266,6 +277,15 @@ Instead, messages are directed to the appropriate network interface according to the network portion of the destination address. .sp .LP +The \fBSO_REUSEPORT\fR option enables multiple \fBAF_INET\fR or \fBAF_INET6\fR +sockets to be bound to an identical address simultaneously. This option can be +enabled on both TCP and UDP sockets. Incoming connections (on TCP) or packets +(on UDP) will be distributed among sockets bound to the specific address. Enabling +this option on each socket (including the first socket) prior to calling \fBbind\fR(3SOCKET) +is required for this option to take effect. To prevent port hijacking, all of the +processes binding to the same address must have the same effective UID. +.sp +.LP The \fBSO_LINGER\fR option controls the action taken when unsent messages are queued on a socket and a \fBclose\fR(2) is performed.
If the socket promises reliable delivery of data and \fBSO_LINGER\fR is set, the system will block the diff --git a/usr/src/pkg/manifests/system-test-ostest.p5m b/usr/src/pkg/manifests/system-test-ostest.p5m index 0481652b7f..5efb129da8 100644 --- a/usr/src/pkg/manifests/system-test-ostest.p5m +++ b/usr/src/pkg/manifests/system-test-ostest.p5m @@ -111,6 +111,15 @@ file path=opt/os-tests/tests/pf_key/acquire-spray mode=0555 file path=opt/os-tests/tests/pf_key/eacq-enabler mode=0555 file path=opt/os-tests/tests/pf_key/kmc-update mode=0555 file path=opt/os-tests/tests/pf_key/kmc-updater mode=0555 +dir path=opt/os-tests/tests/reuseport +file path=opt/os-tests/tests/reuseport/rp_test_tcp mode=0555 +file path=opt/os-tests/tests/reuseport/rp_test_tcp_noopt mode=0555 +file path=opt/os-tests/tests/reuseport/rp_test_tcp_seteuid mode=0555 +file path=opt/os-tests/tests/reuseport/rp_test_tcp_toggle mode=0555 +file path=opt/os-tests/tests/reuseport/rp_test_udp mode=0555 +file path=opt/os-tests/tests/reuseport/rp_test_udp_noopt mode=0555 +file path=opt/os-tests/tests/reuseport/rp_test_udp_seteuid mode=0555 +file path=opt/os-tests/tests/reuseport/rp_test_udp_toggle mode=0555 dir path=opt/os-tests/tests/poll file path=opt/os-tests/tests/poll/epoll_test mode=0555 file path=opt/os-tests/tests/poll/poll_test mode=0555 diff --git a/usr/src/test/os-tests/runfiles/default.run b/usr/src/test/os-tests/runfiles/default.run index 8a83d484c8..cdf9dedea6 100644 --- a/usr/src/test/os-tests/runfiles/default.run +++ b/usr/src/test/os-tests/runfiles/default.run @@ -190,3 +190,14 @@ tests = ['ilstr_basic.32', 'ilstr_basic.64'] [/opt/os-tests/tests/execve] tests = ['execvpe-test.32', 'execvpe-test.64'] + +[/opt/os-tests/tests/reuseport] +user = root +tests = ['rp_test_tcp', + 'rp_test_tcp_noopt', + 'rp_test_tcp_seteuid', + 'rp_test_tcp_toggle', + 'rp_test_udp', + 'rp_test_udp_noopt', + 'rp_test_udp_seteuid', + 'rp_test_udp_toggle'] diff --git a/usr/src/test/os-tests/tests/Makefile 
b/usr/src/test/os-tests/tests/Makefile index accd1d9f64..49a79b5a43 100644 --- a/usr/src/test/os-tests/tests/Makefile +++ b/usr/src/test/os-tests/tests/Makefile @@ -31,6 +31,7 @@ SUBDIRS = \ libtopo \ pf_key \ poll \ + reuseport \ portfs \ regression \ sdevfs \ diff --git a/usr/src/test/os-tests/tests/reuseport/Makefile b/usr/src/test/os-tests/tests/reuseport/Makefile new file mode 100644 index 0000000000..47892d36de --- /dev/null +++ b/usr/src/test/os-tests/tests/reuseport/Makefile @@ -0,0 +1,67 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2012 by Delphix. All rights reserved. +# Copyright 2016 Joyent, Inc. 
+# + +include $(SRC)/cmd/Makefile.cmd +include $(SRC)/test/Makefile.com + +PROG = \ + rp_test_tcp \ + rp_test_tcp_noopt \ + rp_test_tcp_seteuid \ + rp_test_tcp_toggle \ + rp_test_udp \ + rp_test_udp_noopt \ + rp_test_udp_seteuid \ + rp_test_udp_toggle + +OBJS = $(PROG:%=%.o) +SRCS = $(OBJS:%.o=%.c) + +LDLIBS += -lsocket +LDLIBS64 += -lsocket + +CSTD = $(CSTD_GNU99) + +ROOTOPTPKG = $(ROOT)/opt/os-tests +TESTDIR = $(ROOTOPTPKG)/tests/reuseport + +CMDS = $(PROG:%=$(TESTDIR)/%) +$(CMDS) := FILEMODE = 0555 + +LINTS = $(PROG:%=%.ln) + +all: $(PROG) + +install: all $(CMDS) + +lint: $(LINTS) + +clobber: clean + -$(RM) $(PROG) + +clean: + -$(RM) $(OBJS) + +%.ln: %.c + $(LINT.c) $< $(UTILS) $(LDLIBS) + +$(CMDS): $(TESTDIR) $(PROG) + +$(TESTDIR): + $(INS.dir) + +$(TESTDIR)/%: % + $(INS.file) diff --git a/usr/src/test/os-tests/tests/reuseport/rp_test_tcp.c b/usr/src/test/os-tests/tests/reuseport/rp_test_tcp.c new file mode 100644 index 0000000000..1131cc554c --- /dev/null +++ b/usr/src/test/os-tests/tests/reuseport/rp_test_tcp.c @@ -0,0 +1,105 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* Copyright 2020 Araragi Hokuto */ + +/* rp_test_tcp.c -- test SO_REUSEPORT on TCP */ + +/* + * This test creates 4 listening socket, and bind + * them to the same address. None of those bind() + * calls is supposed to fail. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define DONTCARE(x) ((void)(x)) + +int +bind_socket(const struct sockaddr_in *addr) +{ + int fd; + fd = socket(AF_INET, SOCK_STREAM, 0); + if (fd < 0) { + perror("socket"); + return (-1); + } + + int optval = 1; + if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, + &optval, sizeof (optval)) < 0) { + perror("setsockopt"); + DONTCARE(close(fd)); + return (-1); + } + + if (bind(fd, (const void *)addr, sizeof (struct sockaddr_in)) < 0) { + perror("bind"); + DONTCARE(close(fd)); + return (-1); + } + + return (fd); +} + +int fda = 0, + fdb = 0, + fdc = 0, + fdd = 0; + +/* close every successfully bound fd before exiting the test */ +void +close_fds(void) +{ + if (fda > 0) DONTCARE(close(fda)); + if (fdb > 0) DONTCARE(close(fdb)); + if (fdc > 0) DONTCARE(close(fdc)); + if (fdd > 0) DONTCARE(close(fdd)); +} + +int +main(void) +{ + struct sockaddr_in addr; + memset(&addr, 0, sizeof (addr)); + + addr.sin_family = AF_INET; + addr.sin_port = htons(22331); + + if (inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr) != 1) { + /* inet_pton returns 1 on success; anything else is an exception */ + perror("inet_pton"); + return (-1); + } + + fda = bind_socket(&addr); + fdb = bind_socket(&addr); + fdc = bind_socket(&addr); + fdd = bind_socket(&addr); + + int pass; + pass = fda > 0; + pass = pass && (fdb > 0); + pass = pass && (fdc > 0); + pass = pass && (fdd > 0); + + close_fds(); + return (pass ? 0 : 1); +} diff --git a/usr/src/test/os-tests/tests/reuseport/rp_test_tcp_noopt.c b/usr/src/test/os-tests/tests/reuseport/rp_test_tcp_noopt.c new file mode 100644 index 0000000000..d8d0ad5e72 --- /dev/null +++ b/usr/src/test/os-tests/tests/reuseport/rp_test_tcp_noopt.c @@ -0,0 +1,94 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL.
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* Copyright 2020 Araragi Hokuto */ + +/* rp_test_tcp_noopt.c -- test bind(3SOCKET) without SO_REUSEPORT on TCP */ + +/* + * This test creates two AF_INET socket, and try binding them + * to the exact same address, without SO_REUSEPORT set on either + * one. The second bind() is expected to fail in this case. + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define DONTCARE(x) ((void)(x)) + +/* create and bind a socket to the given address */ +int +bind_socket(int *fd, const struct sockaddr_in *addr) +{ + *fd = socket(AF_INET, SOCK_STREAM, 0); + if (*fd < 0) { + /* + * Failed to create socket. + * This is neither PASS or FAIL -- It's an exception. + * return 1 to indicate this scene. + */ + perror("socket"); + return (1); + } + + return (bind(*fd, (const void *)addr, sizeof (struct sockaddr_in))); +} + +int +main(void) +{ + struct sockaddr_in addr; + memset(&addr, 0, sizeof (addr)); + + addr.sin_family = AF_INET; + addr.sin_port = htons(22334); + if (inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr) != 1) { + /* inet_pton failure is an exception */ + perror("inet_pton"); + return (-1); + } + + int fda, fdb; + int bind_ret; + + bind_ret = bind_socket(&fda, &addr); + if (bind_ret == 1) { + /* socket(3SOCKET) failure is an exception */ + return (-1); + } + + if (bind_ret) { + /* failed to bind first socket is an exception */ + perror("bind"); + return (-1); + } + + bind_ret = bind_socket(&fdb, &addr); + if (bind_ret == 1) { + DONTCARE(close(fda)); + return (-1); + } + + int pass; + pass = (bind_ret < 0) && (errno == EADDRINUSE); + + DONTCARE(close(fda)); + DONTCARE(close(fdb)); + + return (pass ? 
0 : 1); +} diff --git a/usr/src/test/os-tests/tests/reuseport/rp_test_tcp_seteuid.c b/usr/src/test/os-tests/tests/reuseport/rp_test_tcp_seteuid.c new file mode 100644 index 0000000000..5a7100f954 --- /dev/null +++ b/usr/src/test/os-tests/tests/reuseport/rp_test_tcp_seteuid.c @@ -0,0 +1,158 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* Copyright 2020 Araragi Hokuto */ + +/* + * rp_test_tcp_seteuid.c -- test SO_REUSEPORT behaviour + * between different effective UIDs on TCP + */ + +/* + * This test spawns a subprocess, and lets it bind a TCP socket; + * then it switches to euid 101 and tries to bind to the same address. + * The second bind should fail. + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define DONTCARE(x) ((void)(x)) + +/* bind a SO_REUSEPORT socket; 1 = setup error, else bind() result */ +int +bind_socket(int *fd, const struct sockaddr_in *addr) +{ + *fd = socket(AF_INET, SOCK_STREAM, 0); + if (*fd < 0) { + /* socket() failure is an exception. Return 1 to indicate this.
*/ + perror("socket"); + return (1); + } + + int optval = 1; + if (setsockopt(*fd, SOL_SOCKET, SO_REUSEPORT, + &optval, sizeof (optval))) { + /* setsockopt() failure is an exception; caller closes *fd */ + perror("setsockopt"); + return (1); + } + + return (bind(*fd, (const void *)addr, sizeof (struct sockaddr_in))); +} + +int +main(void) +{ + struct sockaddr_in addr; + memset(&addr, 0, sizeof (addr)); + + addr.sin_family = AF_INET; + addr.sin_port = htons(22334); + if (inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr) != 1) { + /* inet_pton failure is an exception */ + perror("inet_pton"); + return (-1); + } + + int fd; + int bind_ret; + + int signal_evfd = eventfd(0, 0); + if (signal_evfd < 0) { + perror("eventfd"); + return (-1); + } + + int exit_evfd = eventfd(0, 0); + if (exit_evfd < 0) { + perror("eventfd"); + return (-1); + } + + pid_t pid = fork(); + + if (pid < 0) { + /* fork() fail is an exception */ + perror("fork"); + return (-1); + } + + + if (pid == 0) { + bind_ret = bind_socket(&fd, &addr); + if (bind_ret != 0) { + /* any setup or bind failure in the child is an exception */ + perror("bind"); + + /* write 2 to evfd to indicate exception */ + uint64_t buf = 2; + DONTCARE(write(signal_evfd, &buf, sizeof (buf))); + + DONTCARE(close(fd)); + return (-1); + } + + /* signal parent that we've bound the socket */ + uint64_t buf = 1; + DONTCARE(write(signal_evfd, &buf, sizeof (buf))); + + /* wait for parent signal */ + buf = 0; + DONTCARE(read(exit_evfd, &buf, sizeof (buf))); + assert(buf == 1); + + DONTCARE(close(fd)); + return (0); + } + + /* parent process */ + + /* wait for child process to signal */ + uint64_t buf = 0; + DONTCARE(read(signal_evfd, &buf, sizeof (buf))); + + if (buf == 2) { + /* exception in child process */ + return (-1); + } + assert(buf == 1); + + if (seteuid(101) < 0) { + perror("seteuid"); + /* signal child to exit */ + buf = 1; + DONTCARE(write(exit_evfd, &buf, sizeof (buf))); + return (-1); + } + + bind_ret = bind_socket(&fd, &addr); + + int pass; + pass = (bind_ret <
0) && (errno == EADDRNOTAVAIL); + + /* signal child to exit */ + buf = 1; + DONTCARE(write(exit_evfd, &buf, sizeof (buf))); + + DONTCARE(close(fd)); + + return (pass ? 0 : 1); +} diff --git a/usr/src/test/os-tests/tests/reuseport/rp_test_tcp_toggle.c b/usr/src/test/os-tests/tests/reuseport/rp_test_tcp_toggle.c new file mode 100644 index 0000000000..2f18158103 --- /dev/null +++ b/usr/src/test/os-tests/tests/reuseport/rp_test_tcp_toggle.c @@ -0,0 +1,106 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* Copyright 2020 Araragi Hokuto */ + +/* rp_test_tcp_toggle.c -- test SO_REUSEPORT toggle behaviour on TCP */ + +/* + * This test create and bind a TCP socket, with + * SO_REUSEPORT enabled; then it disables the option, + * and try to bind another socket. Both bind() call + * should success. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define DONTCARE(x) ((void)(x)) + +int +bind_socket(const struct sockaddr_in *addr) +{ + int fd; + fd = socket(AF_INET, SOCK_STREAM, 0); + if (fd < 0) { + perror("socket"); + return (-1); + } + + int optval = 1; + if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, + &optval, sizeof (optval)) < 0) { + perror("setsockopt"); + DONTCARE(close(fd)); + return (-1); + } + + if (bind(fd, (const void *)addr, sizeof (struct sockaddr_in)) < 0) { + perror("bind"); + DONTCARE(close(fd)); + return (-1); + } + + return (fd); +} + +int fda = 0, + fdb = 0; + +/* close fds before exiting test */ +void +close_fds(void) +{ + DONTCARE(close(fda)); + DONTCARE(close(fdb)); +} + +int +main(void) +{ + struct sockaddr_in addr; + memset(&addr, 0, sizeof (addr)); + + addr.sin_family = AF_INET; + addr.sin_port = htons(22331); + + if (inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr) != 1) { + /* inet_pton returns 1 on success; anything else is an exception */ + perror("inet_pton"); + return (-1); + } + + fda = bind_socket(&addr); + + int optval = 0; + if (setsockopt(fda, SOL_SOCKET, SO_REUSEPORT, + &optval, sizeof (optval)) < 0) { + perror("setsockopt 0"); + return (-1); + } + + fdb = bind_socket(&addr); + + int pass; + pass = fda > 0; + pass = pass && (fdb > 0); + + close_fds(); + return (pass ? 0 : 1); +} diff --git a/usr/src/test/os-tests/tests/reuseport/rp_test_udp.c b/usr/src/test/os-tests/tests/reuseport/rp_test_udp.c new file mode 100644 index 0000000000..ffd333066f --- /dev/null +++ b/usr/src/test/os-tests/tests/reuseport/rp_test_udp.c @@ -0,0 +1,105 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source.
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* Copyright 2020 Araragi Hokuto */ + +/* rp_test_udp.c -- test SO_REUSEPORT on UDP */ + +/* + * This test creates 4 SOCK_DGRAM sockets, and binds + * them to the same address. None of those bind() + * calls is supposed to fail. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define DONTCARE(x) ((void)(x)) + +int +bind_socket(const struct sockaddr_in *addr) +{ + int fd; + fd = socket(AF_INET, SOCK_DGRAM, 0); + if (fd < 0) { + perror("socket"); + return (-1); + } + + int optval = 1; + if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, + &optval, sizeof (optval)) < 0) { + perror("setsockopt"); + DONTCARE(close(fd)); + return (-1); + } + + if (bind(fd, (const void *)addr, sizeof (struct sockaddr_in)) < 0) { + perror("bind"); + DONTCARE(close(fd)); + return (-1); + } + + return (fd); +} + +int fda = 0, + fdb = 0, + fdc = 0, + fdd = 0; + +/* close every successfully bound fd before exiting the test */ +void +close_fds(void) +{ + if (fda > 0) DONTCARE(close(fda)); + if (fdb > 0) DONTCARE(close(fdb)); + if (fdc > 0) DONTCARE(close(fdc)); + if (fdd > 0) DONTCARE(close(fdd)); +} + +int +main(void) +{ + struct sockaddr_in addr; + memset(&addr, 0, sizeof (addr)); + + addr.sin_family = AF_INET; + addr.sin_port = htons(22331); + + if (inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr) != 1) { + /* inet_pton returns 1 on success; anything else is an exception */ + perror("inet_pton"); + return (-1); + } + + fda = bind_socket(&addr); + fdb = bind_socket(&addr); + fdc = bind_socket(&addr); + fdd = bind_socket(&addr); + + int pass; + pass = fda > 0; + pass = pass && (fdb > 0); + pass = pass && (fdc > 0); + pass = pass && (fdd > 0); + + close_fds(); + return (pass ?
0 : 1); +} diff --git a/usr/src/test/os-tests/tests/reuseport/rp_test_udp_noopt.c b/usr/src/test/os-tests/tests/reuseport/rp_test_udp_noopt.c new file mode 100644 index 0000000000..be83a7b2ba --- /dev/null +++ b/usr/src/test/os-tests/tests/reuseport/rp_test_udp_noopt.c @@ -0,0 +1,94 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* Copyright 2020 Araragi Hokuto */ + +/* rp_test_udp_noopt.c -- test bind(3SOCKET) without SO_REUSEPORT on UDP */ + +/* + * This test creates two AF_INET socket, and try binding them + * to the exact same address, without SO_REUSEPORT set on either + * one. The second bind() is expected to fail in this case. + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define DONTCARE(x) ((void)(x)) + +/* create and bind a socket to the given address */ +int +bind_socket(int *fd, const struct sockaddr_in *addr) +{ + *fd = socket(AF_INET, SOCK_DGRAM, 0); + if (*fd < 0) { + /* + * Failed to create socket. + * This is neither PASS or FAIL -- It's an exception. + * return 1 to indicate this scene. 
+ */ + perror("socket"); + return (1); + } + + return (bind(*fd, (const void *)addr, sizeof (struct sockaddr_in))); +} + +int +main(void) +{ + struct sockaddr_in addr; + memset(&addr, 0, sizeof (addr)); + + addr.sin_family = AF_INET; + addr.sin_port = htons(22334); + if (inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr) != 1) { + /* inet_pton failure is an exception */ + perror("inet_pton"); + return (-1); + } + + int fda, fdb; + int bind_ret; + + bind_ret = bind_socket(&fda, &addr); + if (bind_ret == 1) { + /* socket(3SOCKET) failure is an exception */ + return (-1); + } + + if (bind_ret) { + /* failing to bind the first socket is an exception */ + perror("bind"); + return (-1); + } + + bind_ret = bind_socket(&fdb, &addr); + if (bind_ret == 1) { + DONTCARE(close(fda)); + return (-1); + } + + int pass; + pass = (bind_ret < 0) && (errno == EADDRINUSE); + + DONTCARE(close(fda)); + DONTCARE(close(fdb)); + + return (pass ? 0 : 1); +} diff --git a/usr/src/test/os-tests/tests/reuseport/rp_test_udp_seteuid.c b/usr/src/test/os-tests/tests/reuseport/rp_test_udp_seteuid.c new file mode 100644 index 0000000000..41979e7354 --- /dev/null +++ b/usr/src/test/os-tests/tests/reuseport/rp_test_udp_seteuid.c @@ -0,0 +1,158 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* Copyright 2020 Araragi Hokuto */ + +/* + * rp_test_udp_seteuid.c -- test SO_REUSEPORT behaviour + * between different effective UIDs on UDP + */ + +/* + * This test spawns a subprocess, and lets it bind a UDP socket; + * then it switches to euid 101 and tries to bind to the same address. + * The second bind should fail.
+ */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define DONTCARE(x) ((void)(x)) + +/* bind a SO_REUSEPORT socket; 1 = setup error, else bind() result */ +int +bind_socket(int *fd, const struct sockaddr_in *addr) +{ + *fd = socket(AF_INET, SOCK_DGRAM, 0); + if (*fd < 0) { + /* socket() failure is an exception. Return 1 to indicate this. */ + perror("socket"); + return (1); + } + + int optval = 1; + if (setsockopt(*fd, SOL_SOCKET, SO_REUSEPORT, + &optval, sizeof (optval))) { + /* setsockopt() failure is an exception; caller closes *fd */ + perror("setsockopt"); + return (1); + } + + return (bind(*fd, (const void *)addr, sizeof (struct sockaddr_in))); +} + +int +main(void) +{ + struct sockaddr_in addr; + memset(&addr, 0, sizeof (addr)); + + addr.sin_family = AF_INET; + addr.sin_port = htons(22334); + if (inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr) != 1) { + /* inet_pton failure is an exception */ + perror("inet_pton"); + return (-1); + } + + int fd; + int bind_ret; + + int signal_evfd = eventfd(0, 0); + if (signal_evfd < 0) { + perror("eventfd"); + return (-1); + } + + int exit_evfd = eventfd(0, 0); + if (exit_evfd < 0) { + perror("eventfd"); + return (-1); + } + + pid_t pid = fork(); + + if (pid < 0) { + /* fork() fail is an exception */ + perror("fork"); + return (-1); + } + + + if (pid == 0) { + bind_ret = bind_socket(&fd, &addr); + if (bind_ret != 0) { + /* any setup or bind failure in the child is an exception */ + perror("bind"); + + /* write 2 to evfd to indicate exception */ + uint64_t buf = 2; + DONTCARE(write(signal_evfd, &buf, sizeof (buf))); + + DONTCARE(close(fd)); + return (-1); + } + + /* signal parent that we've bound the socket */ + uint64_t buf = 1; + DONTCARE(write(signal_evfd, &buf, sizeof (buf))); + + /* wait for parent signal */ + buf = 0; + DONTCARE(read(exit_evfd, &buf, sizeof (buf))); + assert(buf == 1); + + DONTCARE(close(fd)); + return (0); + } + + /* parent process */ + + /* wait for child process to signal */ +
uint64_t buf = 0; + DONTCARE(read(signal_evfd, &buf, sizeof (buf))); + + if (buf == 2) { + /* exception in child process */ + return (-1); + } + assert(buf == 1); + + if (seteuid(101) < 0) { + perror("seteuid"); + /* signal child to exit */ + buf = 1; + DONTCARE(write(exit_evfd, &buf, sizeof (buf))); + return (-1); + } + + bind_ret = bind_socket(&fd, &addr); + + int pass; + pass = (bind_ret < 0) && (errno == EADDRNOTAVAIL); + + /* signal child to exit */ + buf = 1; + DONTCARE(write(exit_evfd, &buf, sizeof (buf))); + + DONTCARE(close(fd)); + + return (pass ? 0 : 1); +} diff --git a/usr/src/test/os-tests/tests/reuseport/rp_test_udp_toggle.c b/usr/src/test/os-tests/tests/reuseport/rp_test_udp_toggle.c new file mode 100644 index 0000000000..4283e38d5f --- /dev/null +++ b/usr/src/test/os-tests/tests/reuseport/rp_test_udp_toggle.c @@ -0,0 +1,106 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* Copyright 2020 Araragi Hokuto */ + +/* rp_test_udp_toggle.c -- test SO_REUSEPORT toggle behaviour on UDP */ + +/* + * This test create and bind a UDP socket, with + * SO_REUSEPORT enabled; then it disables the option, + * and try to bind another socket. Both bind() call + * should success. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define DONTCARE(x) ((void)(x)) + +int +bind_socket(const struct sockaddr_in *addr) +{ + int fd; + fd = socket(AF_INET, SOCK_DGRAM, 0); + if (fd < 0) { + perror("socket"); + return (-1); + } + + int optval = 1; + if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, + &optval, sizeof (optval)) < 0) { + perror("setsockopt"); + DONTCARE(close(fd)); + return (-1); + } + + if (bind(fd, (const void *)addr, sizeof (struct sockaddr_in)) < 0) { + perror("bind"); + DONTCARE(close(fd)); + return (-1); + } + + return (fd); +} + +int fda = 0, + fdb = 0; + +/* close fds before exiting test */ +void +close_fds(void) +{ + DONTCARE(close(fda)); + DONTCARE(close(fdb)); +} + +int +main(void) +{ + struct sockaddr_in addr; + memset(&addr, 0, sizeof (addr)); + + addr.sin_family = AF_INET; + addr.sin_port = htons(22331); + + if (inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr) != 1) { + /* inet_pton returns 1 on success; anything else is an exception */ + perror("inet_pton"); + return (-1); + } + + fda = bind_socket(&addr); + + int optval = 0; + if (setsockopt(fda, SOL_SOCKET, SO_REUSEPORT, + &optval, sizeof (optval)) < 0) { + perror("setsockopt"); + return (-1); + } + + fdb = bind_socket(&addr); + + int pass; + pass = fda > 0; + pass = pass && (fdb > 0); + + close_fds(); + return (pass ? 0 : 1); +} diff --git a/usr/src/uts/common/inet/ip/conn_opt.c b/usr/src/uts/common/inet/ip/conn_opt.c index eeec56b162..fb09023609 100644 --- a/usr/src/uts/common/inet/ip/conn_opt.c +++ b/usr/src/uts/common/inet/ip/conn_opt.c @@ -1132,6 +1132,30 @@ conn_opt_set_socket(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen, case SO_REUSEADDR: connp->conn_reuseaddr = onoff; break; + + case SO_REUSEPORT: + if (!IPCL_IS_NONSTR(connp)) { + if (onoff) { + /* + * SO_REUSEPORT cannot be enabled on sockets + * which have fallen back to the STREAMS API.
+ */ + return (EINVAL); + } else { + /* + * A connection with SO_REUSEPORT enabled + * should be prevented from falling back to + * STREAMS mode via logic in tcp_fallback. + * It is legal, however, for fallen-back + * connections to affirm the disabled state + * of SO_REUSEPORT. + */ + ASSERT(connp->conn_reuseport == 0); + break; + } + } + connp->conn_reuseport = onoff; + break; case SO_DONTROUTE: if (onoff) ixa->ixa_flags |= IXAF_DONTROUTE; diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index 704f152bb9..1f027bf417 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -5279,7 +5279,7 @@ ip_fanout_udp_conn(conn_t *connp, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, * Fanout for UDP packets that are multicast or broadcast, and ICMP errors. * (Unicast fanout is handled in ip_input_v4.) * - * If SO_REUSEADDR is set all multicast and broadcast packets + * If SO_REUSEADDR or SO_REUSEPORT is set all multicast and broadcast packets * will be delivered to all conns bound to the same port. * * If there is at least one matching AF_INET receiver, then we will @@ -5316,8 +5316,9 @@ ip_fanout_udp_multi_v4(mblk_t *mp, ipha_t *ipha, uint16_t lport, uint16_t fport, connp = connfp->connf_head; /* - * If SO_REUSEADDR has been set on the first we send the + * If SO_REUSEADDR or SO_REUSEPORT is set on the first conn, send the * packet to all clients that have joined the group and + * that also * match the port.
*/ while (connp != NULL) { @@ -5334,7 +5335,7 @@ ip_fanout_udp_multi_v4(mblk_t *mp, ipha_t *ipha, uint16_t lport, uint16_t fport, CONN_INC_REF(connp); - if (connp->conn_reuseaddr) { + if (connp->conn_reuseaddr || connp->conn_reuseport) { conn_t *first_connp = connp; conn_t *next_connp; mblk_t *mp1; @@ -5445,11 +5446,12 @@ notfound: ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL); /* - * If SO_REUSEADDR has been set on the first we send the + * If SO_REUSEADDR or SO_REUSEPORT is set on the first conn, send the * packet to all clients that have joined the group and + * that also * match the port. */ - if (connp->conn_reuseaddr) { + if (connp->conn_reuseaddr || connp->conn_reuseport) { conn_t *first_connp = connp; conn_t *next_connp; mblk_t *mp1; diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c index 15ca8adbaa..e6ccd35a8b 100644 --- a/usr/src/uts/common/inet/ip/ip6.c +++ b/usr/src/uts/common/inet/ip/ip6.c @@ -2501,7 +2501,7 @@ ip_fanout_udp_multi_v6(mblk_t *mp, ip6_t *ip6h, uint16_t lport, uint16_t fport, CONN_INC_REF(connp); - if (connp->conn_reuseaddr) { + if (connp->conn_reuseaddr || connp->conn_reuseport) { conn_t *first_connp = connp; conn_t *next_connp; mblk_t *mp1; diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c index d47997a4aa..545d4791ce 100644 --- a/usr/src/uts/common/inet/ip/ipclassifier.c +++ b/usr/src/uts/common/inet/ip/ipclassifier.c @@ -1637,6 +1637,17 @@ ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, if (connp != NULL) { /* Have a listener at least */ + if (connp->conn_rg_bind != NULL) { + /* + * Have multiple bindings by SO_REUSEPORT, + * do load balancing + */ + connp = conn_rg_lb_pick( + connp->conn_rg_bind, + ipha->ipha_src, + ipha->ipha_dst, + ports); + } CONN_INC_REF(connp); mutex_exit(&bind_connfp->connf_lock); return (connp); @@ -1671,6 +1682,17 @@ ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, }
if (connp != NULL) { + if (connp->conn_rg_bind != NULL) { + /* + * Have multiple bindings by SO_REUSEPORT, + * do load balancing + */ + connp = conn_rg_lb_pick( + connp->conn_rg_bind, + ipha->ipha_src, + ipha->ipha_dst, + ports); + } CONN_INC_REF(connp); mutex_exit(&connfp->connf_lock); return (connp); @@ -1772,6 +1794,17 @@ ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, if (connp != NULL) { /* Have a listner at least */ + if (connp->conn_rg_bind != NULL) { + /* + * Have multiple bindings by SO_REUSEPORT, + * do load balancing + */ + connp = conn_rg_lb_pick6( + connp->conn_rg_bind, + &ip6h->ip6_src, + &ip6h->ip6_dst, + ports); + } CONN_INC_REF(connp); mutex_exit(&bind_connfp->connf_lock); return (connp); @@ -1808,6 +1841,17 @@ ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, } if (connp != NULL) { + if (connp->conn_rg_bind != NULL) { + /* + * Have multiple bindings by SO_REUSEPORT, + * do load balancing + */ + connp = conn_rg_lb_pick6( + connp->conn_rg_bind, + &ip6h->ip6_src, + &ip6h->ip6_dst, + ports); + } CONN_INC_REF(connp); mutex_exit(&connfp->connf_lock); return (connp); @@ -2819,3 +2863,215 @@ conn_get_socket_info(conn_t *connp, mib2_socketInfoEntry_t *sie) return (sie); } + +/* + * SO_REUSEPORT support + * + * The SO_REUSEPORT option allows multiple sockets + * to be bound to an identical address, and incoming + * connections or datagrams will be distributed among + * all the sockets bound to the same address, as per + * Linux semantics. + * + * To support this behaviour, we add conn_rg_t, which + * is a local table of all the conn_t belonging to the + * same SO_REUSEPORT group. This table will be allocated + * when the first conn_t is bound (in tcp_bindi()/udp_do_bind()), + * and will be destroyed when the last member of the group + * is removed from the bind hash. + * + * In the ipclassifier, when we find a matching conn_t + * for the incoming packet, we check if it's in a SO_REUSEPORT + * group (i.e. its conn_rg_bind pointer is not NULL).
If + * true, then instead of dispatching the packet to the first + * matching conn_t, we try to do load-balancing by picking + * a connection from the group, based on a hash value of the + * IP 4-tuple. + * + * The conn_rg_t.connrg_lock is for protecting conn_rg_t + * structure, and should only be acquired inside conn_rg_* + * functions. The conn_rg_bind pointer in conn_t is protected + * in the same way as other fields in conn_t. + */ +/* Max number of members in a SO_REUSEPORT group */ +#define CONN_RG_SIZE_MAX 256 +/* Step size when expanding members array */ +#define CONN_RG_SIZE_STEP 4 +/* Initial size of members array */ +#define CONN_RG_SIZE_INIT 4 +/* Allocate a conn_rg_t with connp as its first member; NULL on failure */ +conn_rg_t * +conn_rg_init(conn_t *connp) +{ + conn_rg_t *rg; + rg = kmem_alloc(sizeof (conn_rg_t), KM_NOSLEEP|KM_NORMALPRI); + if (rg == NULL) + return (NULL); + rg->connrg_members = kmem_zalloc(CONN_RG_SIZE_INIT * sizeof (conn_t *), + KM_NOSLEEP|KM_NORMALPRI); + if (rg->connrg_members == NULL) { + kmem_free(rg, sizeof (conn_rg_t)); + return (NULL); + } + mutex_init(&rg->connrg_lock, NULL, MUTEX_DEFAULT, NULL); + rg->connrg_size = CONN_RG_SIZE_INIT; + /* insert connp as the first member */ + rg->connrg_count = 1; + rg->connrg_members[0] = connp; + return (rg); +} +/* + * Destroy a conn_rg_t structure, freeing its members array. + * All conn_t in the group must be removed beforehand. + */ +void +conn_rg_destroy(conn_rg_t *rg) +{ + mutex_enter(&rg->connrg_lock); + ASSERT(rg->connrg_count == 0); + kmem_free(rg->connrg_members, rg->connrg_size * sizeof (conn_t *)); + mutex_destroy(&rg->connrg_lock); + kmem_free(rg, sizeof (conn_rg_t)); +} +/* + * Check that connp has the same effective UID and zone ID as the first + * member of rg. If so, add connp into the connection group.
+ */ +int +conn_rg_insert(conn_rg_t *rg, conn_t *connp) +{ + mutex_enter(&rg->connrg_lock); + VERIFY(rg->connrg_size > 0); + VERIFY(rg->connrg_count <= rg->connrg_size); + if (rg->connrg_count != 0) { + cred_t *oldcred = rg->connrg_members[0]->conn_cred; + cred_t *newcred = connp->conn_cred; + if (crgetuid(oldcred) != crgetuid(newcred) || + crgetzoneid(oldcred) != crgetzoneid(newcred)) { + mutex_exit(&rg->connrg_lock); + return (EADDRNOTAVAIL); + } + } + if (rg->connrg_count == rg->connrg_size) { + uint_t oldalloc = rg->connrg_size * sizeof (conn_t *); + uint_t newsize = rg->connrg_size + CONN_RG_SIZE_STEP; + conn_t **newmembers; + if (newsize > CONN_RG_SIZE_MAX) { + mutex_exit(&rg->connrg_lock); + return (EINVAL); + } + /* grow the members array by CONN_RG_SIZE_STEP slots */ + newmembers = kmem_zalloc(newsize * sizeof (conn_t *), + KM_NOSLEEP|KM_NORMALPRI); + if (newmembers == NULL) { + mutex_exit(&rg->connrg_lock); + return (ENOMEM); + } + bcopy(rg->connrg_members, newmembers, oldalloc); + kmem_free(rg->connrg_members, oldalloc); + rg->connrg_members = newmembers; + rg->connrg_size = newsize; + } + rg->connrg_members[rg->connrg_count] = connp; + rg->connrg_count++; + mutex_exit(&rg->connrg_lock); + return (0); +} +/* + * Remove a connection from the given group. + * Returns the number of connections left in the group. + */ +uint_t +conn_rg_remove(conn_rg_t *rg, conn_t *connp) +{ + uint_t i; + uint_t count_remaining; + mutex_enter(&rg->connrg_lock); + for (i = 0; i < rg->connrg_count; i++) { + if (rg->connrg_members[i] == connp) + break; + } + /* The item should be present */ + ASSERT(i < rg->connrg_count); + /* Move the last member into this position */ + rg->connrg_count--; + rg->connrg_members[i] = rg->connrg_members[rg->connrg_count]; + rg->connrg_members[rg->connrg_count] = NULL; + count_remaining = rg->connrg_count; + mutex_exit(&rg->connrg_lock); + return (count_remaining); +} +/* Hash one uint32_t into hash value using DJBX33A */ +static uint32_t +conn_rg_lb_hash_uint32(uint32_t value, uint32_t
addr) +{ + value = (value << 5) + value + (addr & 0xFF); + value = (value << 5) + value + ((addr >> 8) & 0xFF); + value = (value << 5) + value + ((addr >> 16) & 0xFF); + value = (value << 5) + value + (addr >> 24); + return (value); +} +/* Hash one in6_addr_t, 32 bits at a time, into hash value using DJBX33A */ +static uint32_t +conn_rg_lb_hash_in6_addr(uint32_t value, const in6_addr_t *addr) +{ + value = conn_rg_lb_hash_uint32(value, addr->_S6_un._S6_u32[0]); + value = conn_rg_lb_hash_uint32(value, addr->_S6_un._S6_u32[1]); + value = conn_rg_lb_hash_uint32(value, addr->_S6_un._S6_u32[2]); + value = conn_rg_lb_hash_uint32(value, addr->_S6_un._S6_u32[3]); + return (value); +} +/* Calculate DJBX33A Hash from an IPv4 4-tuple */ +static uint32_t +conn_rg_lb_hash(ipaddr_t laddr, ipaddr_t faddr, uint32_t ports) +{ + uint32_t value = 0; + value = conn_rg_lb_hash_uint32(value, laddr); + value = conn_rg_lb_hash_uint32(value, faddr); + value = conn_rg_lb_hash_uint32(value, ports); + return (value); +} +/* Calculate DJBX33A Hash from an IPv6 4-tuple */ +static uint32_t +conn_rg_lb_hash6(const in6_addr_t *laddr, const in6_addr_t *faddr, + uint32_t ports) +{ + uint32_t value = 0; + value = conn_rg_lb_hash_in6_addr(value, laddr); + value = conn_rg_lb_hash_in6_addr(value, faddr); + value = conn_rg_lb_hash_uint32(value, ports); + return (value); +} +/* + * Pick a connection from the given group, in a load-balanced way + * A DJBX33A Hash based algorithm is used here. + * We do not need a fancy hash algorithm here, since this is only + * for load-balancing (which is only best effort), and it's not cryptography + * related so there is no security concern.
+ */ +conn_t * +conn_rg_lb_pick(conn_rg_t *rg, ipaddr_t src, ipaddr_t dst, uint32_t ports) +{ + uint32_t idx = conn_rg_lb_hash(src, dst, ports); + mutex_enter(&rg->connrg_lock); + idx %= rg->connrg_count; + conn_t *ret = rg->connrg_members[idx]; + mutex_exit(&rg->connrg_lock); + return (ret); +} +/* + * Pick a connection from the given group, in a load-balanced way, + * utilizing IPv6 addresses. + */ +conn_t * +conn_rg_lb_pick6(conn_rg_t *rg, const in6_addr_t *src, const in6_addr_t *dst, + uint32_t ports) +{ + uint32_t idx = conn_rg_lb_hash6(src, dst, ports); + mutex_enter(&rg->connrg_lock); + idx %= rg->connrg_count; + conn_t *ret = rg->connrg_members[idx]; + mutex_exit(&rg->connrg_lock); + return (ret); +} + diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h index 70cff374a4..efb9e4b18a 100644 --- a/usr/src/uts/common/inet/ipclassifier.h +++ b/usr/src/uts/common/inet/ipclassifier.h @@ -222,6 +222,20 @@ typedef struct crb_s { #define crb_recvucred crbu.crbb.crbb_recvucred #define crb_timestamp crbu.crbb.crbb_timestamp +/* + * Track conn_t entities bound to the same port/address tuple via SO_REUSEPORT. + * - connrg_lock: Protects the other fields + * - connrg_size: Allocated size (in entries) of connrg_members array + * - connrg_count: Count of occupied connrg_members slots + * - connrg_members: Connections associated with address/port group + */ +typedef struct conn_rg_s { + kmutex_t connrg_lock; + uint_t connrg_size; + uint_t connrg_count; + conn_t **connrg_members; +} conn_rg_t; + /* * The initial fields in the conn_t are setup by the kmem_cache constructor, * and are preserved when it is freed.
Fields after that are bzero'ed when @@ -331,6 +345,8 @@ struct conn_s { connf_t *conn_fanout; /* Hash bucket we're part of */ struct conn_s *conn_next; /* Hash chain next */ struct conn_s *conn_prev; /* Hash chain prev */ + /* Group of conn_s bound to same address/port pair by SO_REUSEPORT */ + conn_rg_t *conn_rg_bind; struct { in6_addr_t connua_laddr; /* Local address - match */ @@ -740,6 +756,15 @@ extern int ip_helper_stream_setup(queue_t *, dev_t *, int, int, extern mib2_socketInfoEntry_t *conn_get_socket_info(conn_t *, mib2_socketInfoEntry_t *); +/* connection group manipulation */ +conn_rg_t *conn_rg_init(conn_t *); +void conn_rg_destroy(conn_rg_t *); +int conn_rg_insert(conn_rg_t *, conn_t *); +uint_t conn_rg_remove(conn_rg_t *, conn_t *); +conn_t *conn_rg_lb_pick(conn_rg_t *, ipaddr_t, ipaddr_t, uint32_t); +conn_t *conn_rg_lb_pick6(conn_rg_t *, const in6_addr_t *, const in6_addr_t *, + uint32_t); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h index 3ed2b7174a..46e07a64c0 100644 --- a/usr/src/uts/common/inet/tcp.h +++ b/usr/src/uts/common/inet/tcp.h @@ -137,7 +137,6 @@ typedef struct tcphdra_s { struct conn_s; struct tcp_listen_cnt_s; -struct tcp_rg_s; /* * Control structure for each open TCP stream, @@ -408,13 +407,6 @@ typedef struct tcp_s { struct tcp_s *tcp_bind_hash_port; /* tcp_t's bound to the same lport */ struct tcp_s **tcp_ptpbhn; - /* - * Group of tcp_t entries bound to the same adress and port via - * SO_REUSEPORT. The pointer itself is protected by tf_lock in the - * containing tcps_bind_fanout slot. 
- */ - struct tcp_rg_s *tcp_rg_bind; - uint_t tcp_maxpsz_multiplier; uint32_t tcp_lso_max; /* maximum LSO payload */ diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index 427a6df274..c99eacf34a 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -1427,21 +1427,6 @@ tcp_free(tcp_t *tcp) if (tcp->tcp_cc_algo != NULL && tcp->tcp_cc_algo->cb_destroy != NULL) tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv); - /* - * Destroy any association with SO_REUSEPORT group. - */ - if (tcp->tcp_rg_bind != NULL) { - /* - * This is only necessary for connections which enabled - * SO_REUSEPORT but were never bound. Such connections should - * be the one and only member of the tcp_rg_tp to which they - * have been associated. - */ - VERIFY(tcp_rg_remove(tcp->tcp_rg_bind, tcp)); - tcp_rg_destroy(tcp->tcp_rg_bind); - tcp->tcp_rg_bind = NULL; - } - /* * If this is a non-STREAM socket still holding on to an upper * handle, release it. As a result of fallback we might also see @@ -1643,9 +1628,9 @@ tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport, lport = tcp_update_next_port(tcps->tcps_next_port_to_try, tcp, B_TRUE); lport = tcp_bindi(tcp, lport, &connp->conn_laddr_v6, 0, B_TRUE, - B_FALSE, B_FALSE); + B_FALSE, B_FALSE, &error); if (lport == 0) - return (-TNOADDR); + return (error); } /* @@ -1739,9 +1724,9 @@ tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, in_port_t dstport, lport = tcp_update_next_port(tcps->tcps_next_port_to_try, tcp, B_TRUE); lport = tcp_bindi(tcp, lport, &connp->conn_laddr_v6, 0, B_TRUE, - B_FALSE, B_FALSE); + B_FALSE, B_FALSE, &error); if (lport == 0) - return (-TNOADDR); + return (error); } /* diff --git a/usr/src/uts/common/inet/tcp/tcp_bind.c b/usr/src/uts/common/inet/tcp/tcp_bind.c index 5c2e1e1932..4a53081022 100644 --- a/usr/src/uts/common/inet/tcp/tcp_bind.c +++ b/usr/src/uts/common/inet/tcp/tcp_bind.c @@ -45,6 +45,7 @@ #include #include +#include #include #include 
#include @@ -57,8 +58,6 @@ static uint32_t tcp_random_anon_port = 1; static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t, cred_t *cr); static in_port_t tcp_get_next_priv_port(const tcp_t *); -static int tcp_rg_insert(tcp_rg_t *, struct tcp_s *); - /* * Hash list insertion routine for tcp_t structures. Each hash bucket * contains a list of tcp_t entries, and each entry is bound to a unique @@ -176,13 +175,13 @@ tcp_bind_hash_remove(tcp_t *tcp) ASSERT(lockp != NULL); mutex_enter(lockp); - /* destroy any association with SO_REUSEPORT group */ - if (tcp->tcp_rg_bind != NULL) { - if (tcp_rg_remove(tcp->tcp_rg_bind, tcp)) { + /* Destroy any association with SO_REUSEPORT group */ + if (connp->conn_rg_bind != NULL) { + if (conn_rg_remove(connp->conn_rg_bind, connp) == 0) { /* Last one out turns off the lights */ - tcp_rg_destroy(tcp->tcp_rg_bind); + conn_rg_destroy(connp->conn_rg_bind); } - tcp->tcp_rg_bind = NULL; + connp->conn_rg_bind = NULL; } if (tcp->tcp_ptpbhn) { @@ -491,10 +490,10 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, } connp->conn_mlp_type = mlptype; } - + int errcode; allocated_port = tcp_bindi(tcp, requested_port, &v6addr, connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only, - user_specified); + user_specified, &errcode); if (allocated_port == 0) { connp->conn_mlp_type = mlptSingle; @@ -509,7 +508,10 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, SL_ERROR|SL_TRACE, "tcp_bind: requested addr busy"); } - return (-TADDRBUSY); + /* tcp_bindi() presets errcode to -TNOADDR */ + if (errcode == -TNOADDR) + errcode = -TADDRBUSY; + return (errcode); } else { /* If we are out of ports, fail the bind. */ if (connp->conn_debug) { @@ -656,7 +658,6 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, * If "bind_to_req_port_only" parameter is not set and the requested port * number is available, return it. If not, return the first anonymous port we * happen across. If no anonymous ports are available, return 0.
- * * In either case, when succeeding update the tcp_t to record the port number * and insert it in the bind hash table. * @@ -667,7 +668,8 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, int reuseaddr, boolean_t quick_connect, - boolean_t bind_to_req_port_only, boolean_t user_specified) + boolean_t bind_to_req_port_only, boolean_t user_specified, + int *errcode) { /* number of times we have run around the loop */ int count = 0; @@ -676,6 +678,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; boolean_t reuseport = connp->conn_reuseport; + *errcode = -TNOADDR; /* * Lookup for free addresses is done in a loop and "loopmax" @@ -858,7 +861,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * binding if it too had SO_REUSEPORT enabled * when it was bound. */ - attempt_reuse = (ltcp->tcp_rg_bind != NULL); + attempt_reuse = (lconnp->conn_rg_bind != NULL); break; } @@ -907,28 +910,22 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, mutex_exit(&tbf->tf_lock); } else { if (attempt_reuse) { + /* Attempt to join the existing group */ int err; - struct tcp_rg_s *rg; - + conn_rg_t *rg; ASSERT(ltcp != NULL); - ASSERT(ltcp->tcp_rg_bind != NULL); - ASSERT(tcp->tcp_rg_bind != NULL); - ASSERT(ltcp->tcp_rg_bind != tcp->tcp_rg_bind); - - err = tcp_rg_insert(ltcp->tcp_rg_bind, tcp); + ASSERT(ltcp->tcp_connp != NULL); + ASSERT(ltcp->tcp_connp->conn_rg_bind != NULL); + ASSERT(connp != NULL); + ASSERT(connp->conn_rg_bind == NULL); + err = conn_rg_insert( + lconnp->conn_rg_bind, connp); if (err != 0) { mutex_exit(&tbf->tf_lock); + *errcode = err; return (0); } - /* - * Now that the newly-binding socket has joined - * the existing reuseport group on ltcp, it - * should clean up its own (empty) group. 
- */ - rg = tcp->tcp_rg_bind; - tcp->tcp_rg_bind = ltcp->tcp_rg_bind; - VERIFY(tcp_rg_remove(rg, tcp)); - tcp_rg_destroy(rg); + connp->conn_rg_bind = lconnp->conn_rg_bind; } /* @@ -941,6 +938,20 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, ip_xmit_attr_t *, connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL, int32_t, TCPS_IDLE); + /* + * If we are the first here and have SO_REUSEPORT set, + * set up connp->conn_rg_bind + */ + if (connp->conn_reuseport && + (connp->conn_rg_bind == NULL)) { + conn_rg_t *rg = conn_rg_init(connp); + if (rg == NULL) { + mutex_exit(&tbf->tf_lock); + *errcode = ENOMEM; + return (0); + } + connp->conn_rg_bind = rg; + } connp->conn_lport = htons(port); @@ -995,124 +1006,3 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, } while (++count < loopmax); return (0); } - -/* Max number of members in TCP SO_REUSEPORT group */ -#define TCP_RG_SIZE_MAX 64 -/* Step size when expanding members array */ -#define TCP_RG_SIZE_STEP 2 - - -tcp_rg_t * -tcp_rg_init(tcp_t *tcp) -{ - tcp_rg_t *rg; - rg = kmem_alloc(sizeof (tcp_rg_t), KM_NOSLEEP_LAZY); - if (rg == NULL) - return (NULL); - rg->tcprg_members = kmem_zalloc(2 * sizeof (tcp_t *), KM_NOSLEEP_LAZY); - if (rg->tcprg_members == NULL) { - kmem_free(rg, sizeof (tcp_rg_t)); - return (NULL); - } - - mutex_init(&rg->tcprg_lock, NULL, MUTEX_DEFAULT, NULL); - rg->tcprg_size = 2; - rg->tcprg_count = 1; - rg->tcprg_active = 1; - rg->tcprg_members[0] = tcp; - return (rg); -} - -void -tcp_rg_destroy(tcp_rg_t *rg) -{ - mutex_enter(&rg->tcprg_lock); - ASSERT(rg->tcprg_count == 0); - ASSERT(rg->tcprg_active == 0); - kmem_free(rg->tcprg_members, rg->tcprg_size * sizeof (tcp_t *)); - mutex_destroy(&rg->tcprg_lock); - kmem_free(rg, sizeof (struct tcp_rg_s)); -} - -static int -tcp_rg_insert(tcp_rg_t *rg, tcp_t *tcp) -{ - mutex_enter(&rg->tcprg_lock); - - VERIFY(rg->tcprg_size > 0); - VERIFY(rg->tcprg_count <= rg->tcprg_size); - if (rg->tcprg_count != 0) { - cred_t *oldcred = 
rg->tcprg_members[0]->tcp_connp->conn_cred; - cred_t *newcred = tcp->tcp_connp->conn_cred; - - if (crgetuid(oldcred) != crgetuid(newcred) || - crgetzoneid(oldcred) != crgetzoneid(newcred)) { - mutex_exit(&rg->tcprg_lock); - return (EPERM); - } - } - - if (rg->tcprg_count == rg->tcprg_size) { - unsigned int oldalloc = rg->tcprg_size * sizeof (tcp_t *); - unsigned int newsize = rg->tcprg_size + TCP_RG_SIZE_STEP; - tcp_t **newmembers; - - if (newsize > TCP_RG_SIZE_MAX) { - mutex_exit(&rg->tcprg_lock); - return (EINVAL); - } - newmembers = kmem_zalloc(newsize * sizeof (tcp_t *), - KM_NOSLEEP_LAZY); - if (newmembers == NULL) { - mutex_exit(&rg->tcprg_lock); - return (ENOMEM); - } - bcopy(rg->tcprg_members, newmembers, oldalloc); - kmem_free(rg->tcprg_members, oldalloc); - rg->tcprg_members = newmembers; - rg->tcprg_size = newsize; - } - - rg->tcprg_members[rg->tcprg_count] = tcp; - rg->tcprg_count++; - rg->tcprg_active++; - - mutex_exit(&rg->tcprg_lock); - return (0); -} - -boolean_t -tcp_rg_remove(tcp_rg_t *rg, tcp_t *tcp) -{ - int i; - boolean_t is_empty; - - mutex_enter(&rg->tcprg_lock); - for (i = 0; i < rg->tcprg_count; i++) { - if (rg->tcprg_members[i] == tcp) - break; - } - /* The item should be present */ - ASSERT(i < rg->tcprg_count); - /* Move the last member into this position */ - rg->tcprg_count--; - rg->tcprg_members[i] = rg->tcprg_members[rg->tcprg_count]; - rg->tcprg_members[rg->tcprg_count] = NULL; - if (tcp->tcp_connp->conn_reuseport != 0) - rg->tcprg_active--; - is_empty = (rg->tcprg_count == 0); - mutex_exit(&rg->tcprg_lock); - return (is_empty); -} - -void -tcp_rg_setactive(tcp_rg_t *rg, boolean_t is_active) -{ - mutex_enter(&rg->tcprg_lock); - if (is_active) { - rg->tcprg_active++; - } else { - rg->tcprg_active--; - } - mutex_exit(&rg->tcprg_lock); -} diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c index 15e49ae070..e14f178d4f 100644 --- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c +++ 
b/usr/src/uts/common/inet/tcp/tcp_opt_data.c @@ -505,104 +505,6 @@ tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) return (retval); } -/* - * Set a TCP connection's participation in SO_REUSEPORT. This operation is - * performed under the protection of the squeue via tcp_setsockopt. - * The manipulation of tcp_rg_bind, as part of this operation, is subject to - * these constraints: - * 1. Prior to bind(), tcp_rg_bind can be set/cleared in tcp_set_reuseport - * under the protection of the squeue. - * 2. Once the connection has been bound, the tcp_rg_bind pointer must not be - * altered until such time as tcp_free() cleans up the connection. - * 3. A connection undergoing bind, which matches to a connection participating - * in port-reuse, will switch its tcp_rg_bind pointer when it joins the - * group of an existing connection in tcp_bindi(). - */ -static int -tcp_set_reuseport(conn_t *connp, boolean_t do_enable) -{ - tcp_t *tcp = connp->conn_tcp; - struct tcp_rg_s *rg; - - if (!IPCL_IS_NONSTR(connp)) { - if (do_enable) { - /* - * SO_REUSEPORT cannot be enabled on sockets which have - * fallen back to the STREAMS API. - */ - return (EINVAL); - } else { - /* - * A connection with SO_REUSEPORT enabled should be - * prevented from falling back to STREAMS mode via - * logic in tcp_fallback. It is legal, however, for - * fallen-back connections to affirm the disabled state - * of SO_REUSEPORT. - */ - ASSERT(connp->conn_reuseport == 0); - return (0); - } - } - if (tcp->tcp_state <= TCPS_CLOSED) { - return (EINVAL); - } - if (connp->conn_reuseport == 0 && do_enable) { - /* disabled -> enabled */ - if (tcp->tcp_rg_bind != NULL) { - tcp_rg_setactive(tcp->tcp_rg_bind, do_enable); - } else { - /* - * Connection state is not a concern when initially - * populating tcp_rg_bind. Setting it to non-NULL on a - * bound or listening connection would only mean that - * new reused-port binds become a possibility. 
- */ - if ((rg = tcp_rg_init(tcp)) == NULL) { - return (ENOMEM); - } - tcp->tcp_rg_bind = rg; - } - connp->conn_reuseport = 1; - } else if (connp->conn_reuseport != 0 && !do_enable) { - /* enabled -> disabled */ - ASSERT(tcp->tcp_rg_bind != NULL); - if (tcp->tcp_state == TCPS_IDLE) { - /* - * If the connection has not been bound yet, discard - * the reuse group state. Since disabling SO_REUSEPORT - * on a bound socket will _not_ prevent others from - * reusing the port, the presence of tcp_rg_bind is - * used to determine reuse availability, not - * conn_reuseport. - * - * This allows proper behavior for examples such as: - * - * setsockopt(fd1, ... SO_REUSEPORT, &on_val...); - * bind(fd1, &myaddr, ...); - * setsockopt(fd1, ... SO_REUSEPORT, &off_val...); - * - * setsockopt(fd2, ... SO_REUSEPORT, &on_val...); - * bind(fd2, &myaddr, ...); // <- SHOULD SUCCEED - * - */ - rg = tcp->tcp_rg_bind; - tcp->tcp_rg_bind = NULL; - VERIFY(tcp_rg_remove(rg, tcp)); - tcp_rg_destroy(rg); - } else { - /* - * If a connection has been bound, it's no longer safe - * to manipulate tcp_rg_bind until connection clean-up - * during tcp_free. Just mark the member status of the - * connection as inactive. - */ - tcp_rg_setactive(tcp->tcp_rg_bind, do_enable); - } - connp->conn_reuseport = 0; - } - return (0); -} - /* * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements. * Parameters are assumed to be verified by the caller. 
@@ -773,11 +675,6 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, } *outlenp = inlen; return (0); - case SO_REUSEPORT: - if (!checkonly) { - return (tcp_set_reuseport(connp, *i1 != 0)); - } - return (0); } break; case IPPROTO_TCP: diff --git a/usr/src/uts/common/inet/tcp/tcp_socket.c b/usr/src/uts/common/inet/tcp/tcp_socket.c index 32422be675..a30aaa9afc 100644 --- a/usr/src/uts/common/inet/tcp/tcp_socket.c +++ b/usr/src/uts/common/inet/tcp/tcp_socket.c @@ -1031,11 +1031,11 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, /* * Do not allow fallback on connections making use of SO_REUSEPORT. */ - if (tcp->tcp_rg_bind != NULL) { + if (connp->conn_rg_bind != NULL || connp->conn_reuseport) { freeb(stropt_mp); freeb(ordrel_mp); squeue_synch_exit(connp, SQ_NODRAIN); - return (EINVAL); + return (ENXIO); } /* diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h index 61af05f749..2b5333267f 100644 --- a/usr/src/uts/common/inet/tcp_impl.h +++ b/usr/src/uts/common/inet/tcp_impl.h @@ -395,22 +395,6 @@ typedef struct tcp_listen_cnt_s { uint32_t tlc_drop; } tcp_listen_cnt_t; -/* - * Track tcp_t entities bound to the same port/address tuple via SO_REUSEPORT. 
- * - tcprg_lock: Protects the other fields - * - tcprg_size: Allocated size (in entries) of tcprg_members array - * - tcprg_count: Count of occupied tcprg_members slots - * - tcprg_active: Count of members which still have SO_REUSEPORT set - * - tcprg_members: Connections associated with address/port group - */ -typedef struct tcp_rg_s { - kmutex_t tcprg_lock; - unsigned int tcprg_size; - unsigned int tcprg_count; - unsigned int tcprg_active; - tcp_t **tcprg_members; -} tcp_rg_t; - #define TCP_TLC_REPORT_INTERVAL (30 * MINUTES) #define TCP_DECR_LISTEN_CNT(tcp) \ @@ -708,14 +692,9 @@ extern int tcp_bind_check(conn_t *, struct sockaddr *, socklen_t, extern void tcp_bind_hash_insert(tf_t *, tcp_t *, int); extern void tcp_bind_hash_remove(tcp_t *); extern in_port_t tcp_bindi(tcp_t *, in_port_t, const in6_addr_t *, - int, boolean_t, boolean_t, boolean_t); + int, boolean_t, boolean_t, boolean_t, int *error); extern in_port_t tcp_update_next_port(in_port_t, const tcp_t *, boolean_t); -extern tcp_rg_t *tcp_rg_init(tcp_t *); -extern boolean_t tcp_rg_remove(tcp_rg_t *, tcp_t *); -extern void tcp_rg_destroy(tcp_rg_t *); -extern void tcp_rg_setactive(tcp_rg_t *, boolean_t); - /* * Fusion related functions in tcp_fusion.c. 
*/ diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index 4e208465f2..d85df1c1ac 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -489,6 +489,14 @@ udp_bind_hash_remove(udp_t *udp, boolean_t caller_holds_lock) ASSERT(lockp != NULL); mutex_enter(lockp); } + + if (connp->conn_rg_bind != NULL) { + if (conn_rg_remove(connp->conn_rg_bind, connp) == 0) { + conn_rg_destroy(connp->conn_rg_bind); + } + connp->conn_rg_bind = NULL; + } + if (udp->udp_ptpbhn != NULL) { udpnext = udp->udp_bind_hash; if (udpnext != NULL) { @@ -5132,25 +5140,28 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, } /* - * If conn_reuseaddr is not set, then we have to make sure that - * the IP address and port number the application requested - * (or we selected for the application) is not being used by - * another stream. If another stream is already using the - * requested IP address and port, the behavior depends on - * "bind_to_req_port_only". If set the bind fails; otherwise we - * search for any unused port to bind to the stream. + * If neither conn_reuseaddr nor conn_reuseport is set, + * then we have to make sure that the IP address and port number + * the application requested (or we selected for the application) + * is not being used by another stream. If another stream is + * already using the requested IP address and port, the behavior + * depends on "bind_to_req_port_only". If set the bind fails; + * otherwise we search for any unused port to bind to the stream. * * As per the BSD semantics, as modified by the Deering multicast * changes, if conn_reuseaddr is set, then we allow multiple binds * to the same port independent of the local IP address. * + * As per the Linux sematics, if conn_reuseport is set, then we + * allow multiple duplicate binds to the same address. + * * This is slightly different than in SunOS 4.X which did not * support IP multicast. 
Note that the change implemented by the * Deering multicast code effects all binds - not only binding * to IP multicast addresses. * - * Note that when binding to port zero we ignore SO_REUSEADDR in - * order to guarantee a unique port. + * Note that when binding to port zero we ignore SO_REUSEADDR + * or SO_REUSEPORT in order to guarantee a unique port. */ count = 0; @@ -5270,7 +5281,8 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, } if (!found_exclbind && - (connp->conn_reuseaddr && requested_port != 0)) { + (connp->conn_reuseaddr && + !connp->conn_reuseport && requested_port != 0)) { break; } @@ -5279,8 +5291,57 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, * No other stream has this IP address * and port number. We can use it. */ + if (connp->conn_reuseport && + (connp->conn_rg_bind == NULL)) { + /* + * We are the first in this rg group, + * set up conn_rg_bind + */ + conn_rg_t *rg = conn_rg_init(connp); + if (rg == NULL) { + mutex_exit(&udpf->uf_lock); + mutex_exit(&connp->conn_lock); + return (ENOMEM); + } + connp->conn_rg_bind = rg; + } + break; + } + + if (!found_exclbind && + (connp->conn_reuseport && requested_port != 0)) { + /* + * We have SO_REUSEPORT set, so attempt to + * join the existing conn_rg_bind group + */ + ASSERT(udp1 != NULL); + ASSERT(connp1 != NULL); + + int err = 0; + + /* Reject reuse if not set on the first */ + if (connp1->conn_rg_bind == NULL) { + err = -TADDRBUSY; + goto errout; + } + + /* Attemp to join the group */ + conn_rg_t *rg = connp1->conn_rg_bind; + err = conn_rg_insert(rg, connp); + if (err != 0) { + goto errout; + } + connp->conn_rg_bind = rg; break; + + errout: + if (err != 0) { + mutex_exit(&udpf->uf_lock); + mutex_exit(&connp->conn_lock); + return (err); + } } + mutex_exit(&udpf->uf_lock); if (bind_to_req_port_only) { /* @@ -6315,6 +6376,13 @@ udp_fallback(sock_lower_handle_t proto_handle, queue_t *q, stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, 
STR_NOSIG, NULL); + /* Do not allow fallback on connections making use of SO_REUSEPORT. */ + if (connp->conn_rg_bind != NULL || connp->conn_reuseport) { + /* release the mblk allocated above */ + freeb(stropt_mp); + return (ENXIO); + } + /* * setup the fallback stream that was allocated */ diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c index 9c05b8c876..afe575396a 100644 --- a/usr/src/uts/common/inet/udp/udp_opt_data.c +++ b/usr/src/uts/common/inet/udp/udp_opt_data.c @@ -64,6 +64,7 @@ opdes_t udp_opt_arr[] = { }, { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEPORT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, diff --git a/usr/src/uts/common/sys/socket.h b/usr/src/uts/common/sys/socket.h index 25880522e9..83c0c3f6b4 100644 --- a/usr/src/uts/common/sys/socket.h +++ b/usr/src/uts/common/sys/socket.h @@ -142,6 +142,8 @@ typedef void *_RESTRICT_KYWD Psocklen_t; #define SO_OOBINLINE 0x0100 /* leave received OOB data in line */ #define SO_DGRAM_ERRIND 0x0200 /* Application wants delayed error */ #define SO_RECVUCRED 0x0400 /* Application wants ucred of sender */ +#define SO_REUSEPORT 0x2004 /* allow simultaneous port reuse */ + /* * Socket options are passed using a signed integer, but it is also rare -- 2.29.2