通过 netlink 从内核向用户 space 单播数据失败
Failure while unicast data from kernel to user space via netlink
我是内核开发的新手,在通过 netlink 套接字将数据从内核 space 单向投射到用户 space 时遇到了问题。 send_nat() 函数将从内核模块调用以将用户定义的 struct nat_mntr 写入 netlink 套接字.但是 nlmsg_unicast() 一直失败,即使我在 cfg 中尝试了不同的配置。请帮助我找出代码中的错误。
userdefined.c
int no_data_request = 1;
EXPORT_SYMBOL(no_data_request);
int request_pid = 0; // PID of requesting process
EXPORT_SYMBOL(request_pid);
void send_nat(struct sock *nl_sk, struct nat_mntr *nat_data, int pid, int group, gfp_t flags, int *sock_closed){
struct nlmsghdr *nlh;
struct sk_buff *skb_out;
int msg_size;
int res = 200;
#define MYPROTO 31
printk(KERN_DEBUG "%s: Entered \n", __FUNCTION__);
if ( nat_data == NULL ){
printk(KERN_DEBUG "%s: nat_data is NULL: Leaving \n", __FUNCTION__);
return ;
}else {
printk(KERN_DEBUG "%s: nat_data is filled \n", __FUNCTION__);
}
if (nl_sk == NULL) {
printk(KERN_DEBUG "%s: nl_sk is NULL \n", __FUNCTION__);
}else
printk(KERN_DEBUG "%s: nl_sock is not null \n", __FUNCTION__);
if (*sock_closed == 1) {
printk(KERN_DEBUG "%s: sock_closed == 1, creating socket \n", __FUNCTION__);
struct netlink_kernel_cfg cfg = {
.groups = 1,
.input = rr,
};
nl_sk = netlink_kernel_create(&init_net, MYPROTO, &cfg);
if (!nl_sk) {
printk(KERN_DEBUG "%s: Error creating socket: sock_closed = %d: Leaving \n", __FUNCTION__ , *sock_closed);
return ;
}
else {
*sock_closed = 0;
printk(KERN_DEBUG "%s: Socket created successfully: sock_closed = %d \n", __FUNCTION__ , *sock_closed );
}
}
else if (*sock_closed == 0 ){
printk(KERN_DEBUG "%s:Already created socket. sock_closed = 0 \n", __FUNCTION__);
}
else {
printk(KERN_DEBUG "%s: sock_closed status is unknown: sock_closed = %d Leaving \n", __FUNCTION__ , *sock_closed);
return ;
}
if( no_data_request){
printk(KERN_DEBUG "%s: No one has requested data: Leaving \n", __FUNCTION__);
return ;
}else{
printk(KERN_DEBUG "%s: Process %d requested the data\n", __FUNCTION__, request_pid );
}
msg_size = sizeof(struct nat_mntr);
skb_out = nlmsg_new(msg_size, 0);
if ( !skb_out ) {
printk(KERN_DEBUG "%s: Failed to skb_out = nlmsg_new(msg_size, 0): Leaving \n", __FUNCTION__);
return;
}
nlh = nlmsg_put(skb_out, 0, 0, NLMSG_DONE, msg_size, 0); /* NLMSG_DONE */
NETLINK_CB(skb_out).dst_group = 0; /* not in mcast group */
if (!nlh) {
printk(KERN_DEBUG "%s: Failed nlh = nlmsg_put(skb_out, 0, 1, NLMSG_DONE, msg_size, 0): Leaving \n", __FUNCTION__);
return ;
}
else {
printk(KERN_DEBUG "%s: Successfull nlh = nlmsg_put(skb_out, 0, 0, NLMSG_DONE, msg_size, 0) \n", __FUNCTION__);
}
if(memcpy(nlmsg_data(nlh), nat_data , sizeof(nat_data) ) == NULL ) {
printk(KERN_DEBUG "%s: Failed to memcpy(nlmsg_data(nlh), nat_data , sizeof(struct nat_mntr)) Leaving \n", __FUNCTION__);
return ;
}
res = nlmsg_unicast(nl_sk, skb_out2, request_pid);
if (res < 0 ){
printk(KERN_DEBUG "%s: Failed to nlmsg_unicast(nl_sk, skb_out, request_pid): Leaving \n", __FUNCTION__);
return ;
}
printk(KERN_DEBUG "%s: Data sent successfully : Leaving \n", __FUNCTION__);
}
// Callback of kernel socket.
void rr(struct sk_buff *skb){
printk(KERN_DEBUG "%s: Entered \n", __FUNCTION__);
struct nlmsghdr *nlh;
nlh = (struct nlmsghdr *)skb->data;
printk(KERN_DEBUG "Request received \n");
request_pid = nlh->nlmsg_pid; /* pid of sending process */
no_data_request = 0; // Someone is out there
printk(KERN_DEBUG "%s: Leaving:\n", __FUNCTION__);
}
kernel_module.c
#define NAT_GROUP 21
struct sock *nl_sk_ud = NULL;
EXPORT_SYMBOL(nl_sk_ud);
int sock_closed = 1;
EXPORT_SYMBOL(sock_closed);
struct nat_mntr *data = NULL;
EXPORT_SYMBOL(data);
any_kernel_function(){
....
data = get_info(skb, 0, l3proto, l4proto, &target, mtype); // Returns pointer to struct nat_mntr
send_nat(nl_sk_ud, data, 0, NAT_GROUP, 0, &sock_closed);
....
}
如果您的内核模块写入一个答案,那么用户空间请求将收到两个响应:一个 ACK(由内核自动制作)和实际响应。
我认为人们不会注意到这一点,因为通常内核模块会在 ACK 之前快速响应。因此,用户空间客户端收到答案并忽略接下来的任何内容(包括 ACK),直到下一个请求。
在您的代码中,内核模块没有立即响应。它会等到数据可用,然后再获取数据。这可能会发生:
- 用户空间客户端发送请求。
- 内核模块存储 pid。
- Linux 向客户端回复 ACK。
- 客户端收到应答,错误地将其解析为伪造的 nat_mntr 而不是 ACK,并关闭套接字。
- 内核模块一旦有了数据就发送应答。 nlmsg_unicast() returns 错误代码 -111 因为客户端不再监听。
解决此问题的一种方法是让客户端期待两个数据包 - 并忽略第一个数据包,即 ACK。
顺便说一句:这不是您的代码的唯一问题。
当你这样做时
nl_sk = netlink_kernel_create(&init_net, MYPROTO, &cfg);
您正在将套接字分配给局部变量。如果再次调用该函数,即使 sock_closed
为 0,nl_sk will
也不会被初始化。
- 您永远不会释放套接字。因为您不能在内核空间中以相同的协议打开两个套接字,所以后续的套接字创建将无法挽回地失败,直到您重新启动。 (例如,如果您需要重新编译,这将影响您。)
- 代码很活泼。至少,
no_data_request
和 request_pid
应该是原子整数。
不要这样做:
nlh = (struct nlmsghdr *)skb->data;
这个比较好:
nlh = nlmsg_hdr(skb);
这甚至更好(因为它为您做了一些验证和 Netlink 文书工作):
netlink_rcv_skb(skb, &rr2);
我是内核开发的新手,在通过 netlink 套接字将数据从内核 space 单向投射到用户 space 时遇到了问题。 send_nat() 函数将从内核模块调用以将用户定义的 struct nat_mntr 写入 netlink 套接字.但是 nlmsg_unicast() 一直失败,即使我在 cfg 中尝试了不同的配置。请帮助我找出代码中的错误。
userdefined.c
int no_data_request = 1;
EXPORT_SYMBOL(no_data_request);
int request_pid = 0; // PID of requesting process
EXPORT_SYMBOL(request_pid);
void send_nat(struct sock *nl_sk, struct nat_mntr *nat_data, int pid, int group, gfp_t flags, int *sock_closed){
struct nlmsghdr *nlh;
struct sk_buff *skb_out;
int msg_size;
int res = 200;
#define MYPROTO 31
printk(KERN_DEBUG "%s: Entered \n", __FUNCTION__);
if ( nat_data == NULL ){
printk(KERN_DEBUG "%s: nat_data is NULL: Leaving \n", __FUNCTION__);
return ;
}else {
printk(KERN_DEBUG "%s: nat_data is filled \n", __FUNCTION__);
}
if (nl_sk == NULL) {
printk(KERN_DEBUG "%s: nl_sk is NULL \n", __FUNCTION__);
}else
printk(KERN_DEBUG "%s: nl_sock is not null \n", __FUNCTION__);
if (*sock_closed == 1) {
printk(KERN_DEBUG "%s: sock_closed == 1, creating socket \n", __FUNCTION__);
struct netlink_kernel_cfg cfg = {
.groups = 1,
.input = rr,
};
nl_sk = netlink_kernel_create(&init_net, MYPROTO, &cfg);
if (!nl_sk) {
printk(KERN_DEBUG "%s: Error creating socket: sock_closed = %d: Leaving \n", __FUNCTION__ , *sock_closed);
return ;
}
else {
*sock_closed = 0;
printk(KERN_DEBUG "%s: Socket created successfully: sock_closed = %d \n", __FUNCTION__ , *sock_closed );
}
}
else if (*sock_closed == 0 ){
printk(KERN_DEBUG "%s:Already created socket. sock_closed = 0 \n", __FUNCTION__);
}
else {
printk(KERN_DEBUG "%s: sock_closed status is unknown: sock_closed = %d Leaving \n", __FUNCTION__ , *sock_closed);
return ;
}
if( no_data_request){
printk(KERN_DEBUG "%s: No one has requested data: Leaving \n", __FUNCTION__);
return ;
}else{
printk(KERN_DEBUG "%s: Process %d requested the data\n", __FUNCTION__, request_pid );
}
msg_size = sizeof(struct nat_mntr);
skb_out = nlmsg_new(msg_size, 0);
if ( !skb_out ) {
printk(KERN_DEBUG "%s: Failed to skb_out = nlmsg_new(msg_size, 0): Leaving \n", __FUNCTION__);
return;
}
nlh = nlmsg_put(skb_out, 0, 0, NLMSG_DONE, msg_size, 0); /* NLMSG_DONE */
NETLINK_CB(skb_out).dst_group = 0; /* not in mcast group */
if (!nlh) {
printk(KERN_DEBUG "%s: Failed nlh = nlmsg_put(skb_out, 0, 1, NLMSG_DONE, msg_size, 0): Leaving \n", __FUNCTION__);
return ;
}
else {
printk(KERN_DEBUG "%s: Successfull nlh = nlmsg_put(skb_out, 0, 0, NLMSG_DONE, msg_size, 0) \n", __FUNCTION__);
}
if(memcpy(nlmsg_data(nlh), nat_data , sizeof(nat_data) ) == NULL ) {
printk(KERN_DEBUG "%s: Failed to memcpy(nlmsg_data(nlh), nat_data , sizeof(struct nat_mntr)) Leaving \n", __FUNCTION__);
return ;
}
res = nlmsg_unicast(nl_sk, skb_out2, request_pid);
if (res < 0 ){
printk(KERN_DEBUG "%s: Failed to nlmsg_unicast(nl_sk, skb_out, request_pid): Leaving \n", __FUNCTION__);
return ;
}
printk(KERN_DEBUG "%s: Data sent successfully : Leaving \n", __FUNCTION__);
}
// Callback of kernel socket.
void rr(struct sk_buff *skb){
printk(KERN_DEBUG "%s: Entered \n", __FUNCTION__);
struct nlmsghdr *nlh;
nlh = (struct nlmsghdr *)skb->data;
printk(KERN_DEBUG "Request received \n");
request_pid = nlh->nlmsg_pid; /* pid of sending process */
no_data_request = 0; // Someone is out there
printk(KERN_DEBUG "%s: Leaving:\n", __FUNCTION__);
}
kernel_module.c
#define NAT_GROUP 21
struct sock *nl_sk_ud = NULL;
EXPORT_SYMBOL(nl_sk_ud);
int sock_closed = 1;
EXPORT_SYMBOL(sock_closed);
struct nat_mntr *data = NULL;
EXPORT_SYMBOL(data);
any_kernel_function(){
....
data = get_info(skb, 0, l3proto, l4proto, &target, mtype); // Returns pointer to struct nat_mntr
send_nat(nl_sk_ud, data, 0, NAT_GROUP, 0, &sock_closed);
....
}
如果您的内核模块写入一个答案,那么用户空间请求将收到两个响应:一个 ACK(由内核自动制作)和实际响应。
我认为人们不会注意到这一点,因为通常内核模块会在 ACK 之前快速响应。因此,用户空间客户端收到答案并忽略接下来的任何内容(包括 ACK),直到下一个请求。
在您的代码中,内核模块没有立即响应。它会等到数据可用,然后再获取数据。这可能会发生:
- 用户空间客户端发送请求。
- 内核模块存储 pid。
- Linux 向客户端回复 ACK。
- 客户端收到应答,错误地将其解析为伪造的 nat_mntr 而不是 ACK,并关闭套接字。
- 内核模块一旦有了数据就发送应答。 nlmsg_unicast() returns 错误代码 -111 因为客户端不再监听。
解决此问题的一种方法是让客户端期待两个数据包 - 并忽略第一个数据包,即 ACK。
顺便说一句:这不是您的代码的唯一问题。
当你这样做时
nl_sk = netlink_kernel_create(&init_net, MYPROTO, &cfg);
您正在将套接字分配给局部变量。如果再次调用该函数,即使 sock_closed
为 0,nl_sk will
也不会被初始化。
- 您永远不会释放套接字。因为您不能在内核空间中以相同的协议打开两个套接字,所以后续的套接字创建将无法挽回地失败,直到您重新启动。 (例如,如果您需要重新编译,这将影响您。)
- 代码很活泼。至少,
no_data_request
和request_pid
应该是原子整数。 不要这样做:
nlh = (struct nlmsghdr *)skb->data;
这个比较好:
nlh = nlmsg_hdr(skb);
这甚至更好(因为它为您做了一些验证和 Netlink 文书工作):
netlink_rcv_skb(skb, &rr2);