--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -518,7 +518,8 @@ static int tap_open(struct inode *inode, struct file *file)
goto err;
}
- init_waitqueue_head(&q->sock.wq.wait);
+ RCU_INIT_POINTER(q->sock.wq, &q->wq);
+ init_waitqueue_head(&q->wq.wait);
q->sock.type = SOCK_RAW;
q->sock.state = SS_CONNECTED;
q->sock.file = file;
@@ -576,7 +577,7 @@ static __poll_t tap_poll(struct file *file, poll_table *wait)
goto out;
mask = 0;
- poll_wait(file, &q->sock.wq.wait, wait);
+ poll_wait(file, &q->wq.wait, wait);
if (!ptr_ring_empty(&q->ring))
mask |= EPOLLIN | EPOLLRDNORM;
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -160,6 +160,7 @@ struct tun_pcpu_stats {
struct tun_file {
struct sock sk;
struct socket socket;
+ struct socket_wq wq;
struct tun_struct __rcu *tun;
struct fasync_struct *fasync;
/* only used for fasnyc */
@@ -2173,7 +2174,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
goto out;
}
- add_wait_queue(&tfile->socket.wq.wait, &wait);
+ add_wait_queue(&tfile->wq.wait, &wait);
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
@@ -2193,7 +2194,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
}
__set_current_state(TASK_RUNNING);
- remove_wait_queue(&tfile->socket.wq.wait, &wait);
+ remove_wait_queue(&tfile->wq.wait, &wait);
out:
*err = error;
@@ -3434,7 +3435,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
tfile->flags = 0;
tfile->ifindex = 0;
- init_waitqueue_head(&tfile->socket.wq.wait);
+ init_waitqueue_head(&tfile->wq.wait);
+ RCU_INIT_POINTER(tfile->socket.wq, &tfile->wq);
tfile->socket.file = file;
tfile->socket.ops = &tun_socket_ops;
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -62,6 +62,7 @@ struct tap_dev {
struct tap_queue {
struct sock sk;
struct socket sock;
+ struct socket_wq wq;
int vnet_hdr_sz;
struct tap_dev __rcu *tap;
struct file *file;
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -116,11 +116,11 @@ struct socket {
unsigned long flags;
+ struct socket_wq *wq;
+
struct file *file;
struct sock *sk;
const struct proto_ops *ops;
-
- struct socket_wq wq;
};
struct vm_area_struct;
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1841,7 +1841,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
{
WARN_ON(parent->sk);
write_lock_bh(&sk->sk_callback_lock);
- rcu_assign_pointer(sk->sk_wq, &parent->wq);
+ rcu_assign_pointer(sk->sk_wq, parent->wq);
parent->sk = sk;
sk_set_socket(sk, parent);
sk->sk_uid = SOCK_INODE(parent)->i_uid;
@@ -2119,7 +2119,7 @@ static inline void sock_poll_wait(struct file *filp, struct socket *sock,
poll_table *p)
{
if (!poll_does_not_wait(p)) {
- poll_wait(filp, &sock->wq.wait, p);
+ poll_wait(filp, &sock->wq->wait, p);
/* We need to be sure we are in sync with the
* socket flags modification.
*
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2869,7 +2869,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
if (sock) {
sk->sk_type = sock->type;
- RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
+ RCU_INIT_POINTER(sk->sk_wq, sock->wq);
sock->sk = sk;
sk->sk_uid = SOCK_INODE(sock)->i_uid;
} else {
--- a/net/socket.c
+++ b/net/socket.c
@@ -249,13 +249,20 @@ static struct kmem_cache *sock_inode_cachep __ro_after_init;
static struct inode *sock_alloc_inode(struct super_block *sb)
{
struct socket_alloc *ei;
+ struct socket_wq *wq;
ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
- init_waitqueue_head(&ei->socket.wq.wait);
- ei->socket.wq.fasync_list = NULL;
- ei->socket.wq.flags = 0;
+ wq = kmalloc(sizeof(*wq), GFP_KERNEL);
+ if (!wq) {
+ kmem_cache_free(sock_inode_cachep, ei);
+ return NULL;
+ }
+ init_waitqueue_head(&wq->wait);
+ wq->fasync_list = NULL;
+ wq->flags = 0;
+ ei->socket.wq = wq;
ei->socket.state = SS_UNCONNECTED;
ei->socket.flags = 0;
@@ -271,6 +278,7 @@ static void sock_free_inode(struct inode *inode)
struct socket_alloc *ei;
ei = container_of(inode, struct socket_alloc, vfs_inode);
+ kfree(ei->socket.wq);
kmem_cache_free(sock_inode_cachep, ei);
}
@@ -610,7 +618,7 @@ static void __sock_release(struct socket *sock, struct inode *inode)
module_put(owner);
}
- if (sock->wq.fasync_list)
+ if (sock->wq->fasync_list)
pr_err("%s: fasync list not empty!\n", __func__);
if (!sock->file) {
@@ -1299,12 +1307,13 @@ static int sock_fasync(int fd, struct file *filp, int on)
{
struct socket *sock = filp->private_data;
struct sock *sk = sock->sk;
- struct socket_wq *wq = &sock->wq;
+ struct socket_wq *wq;
if (sk == NULL)
return -EINVAL;
lock_sock(sk);
+ wq = sock->wq;
fasync_helper(fd, filp, on, &wq->fasync_list);
if (!wq->fasync_list)
From: SeongJae Park <sjpark@amazon.de>

This reverts commit 333f7909a8573145811c4ab7d8c9092301707721.

Commit 6d7855c54e1e ("sockfs: switch to ->free_inode()") made the
deallocation of 'socket_alloc' asynchronous using RCU, as was already
the case for 'sock.wq'.  Commit 333f7909a857 ("coallocate socket_wq
with socket itself") then gave the two the same life cycle.  The
changes made the code much simpler, but they also made 'socket_alloc'
live longer than before.  As a result, user programs that intensively
repeat socket allocations and deallocations can create memory pressure
on recent kernels.

To avoid the problem, this commit separates the life cycles of
'socket_alloc' and 'sock.wq' again.  The following commit will make the
deallocation of 'socket_alloc' synchronous again.
---
 drivers/net/tap.c      |  5 +++--
 drivers/net/tun.c      |  8 +++++---
 include/linux/if_tap.h |  1 +
 include/linux/net.h    |  4 ++--
 include/net/sock.h     |  4 ++--
 net/core/sock.c        |  2 +-
 net/socket.c           | 19 ++++++++++++++-----
 7 files changed, 28 insertions(+), 15 deletions(-)
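For illustration only, not part of the diff to apply: the net/socket.c
hunks above restore an allocation/teardown pairing in which 'socket_wq'
is a small, separately kmalloc()'d object that 'struct socket' merely
points to, so it no longer has to share whatever deferred free the
surrounding 'socket_alloc' gets.  A condensed sketch of that pairing,
with the unrelated field initialisation omitted:

	/* Condensed view of the restored net/socket.c pattern. */
	static struct inode *sock_alloc_inode(struct super_block *sb)
	{
		struct socket_alloc *ei;
		struct socket_wq *wq;

		ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
		if (!ei)
			return NULL;

		/* The wait queue gets its own, independent allocation again. */
		wq = kmalloc(sizeof(*wq), GFP_KERNEL);
		if (!wq) {
			kmem_cache_free(sock_inode_cachep, ei);
			return NULL;
		}
		init_waitqueue_head(&wq->wait);
		wq->fasync_list = NULL;
		wq->flags = 0;

		/* The socket only carries a pointer to the wq. */
		ei->socket.wq = wq;
		ei->socket.state = SS_UNCONNECTED;
		ei->socket.flags = 0;

		return &ei->vfs_inode;
	}

	static void sock_free_inode(struct inode *inode)
	{
		struct socket_alloc *ei;

		ei = container_of(inode, struct socket_alloc, vfs_inode);
		/* Reclaimed on its own schedule, independent of 'ei'. */
		kfree(ei->socket.wq);
		kmem_cache_free(sock_inode_cachep, ei);
	}

The point, per the changelog, is that the two objects can now be
reclaimed on independent schedules: a follow-up change can free the
large 'socket_alloc' synchronously while leaving the small wq object to
whatever deferral the RCU readers of 'sk->sk_wq' require.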