连接到 Web 服务器时连接被拒绝

connection refused when connecting to web server

我正在尝试连接到 Web 服务器以便抓取网页,但程序报错:connect() 函数返回“Connection refused”(连接被拒绝)。代码如下:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <pthread.h>
#include <errno.h>
#include <netdb.h>
#include <arpa/inet.h>
#include <regex.h>

#define MAXLINE 4096

/*
 * Question's original program (kept verbatim as the subject of the discussion).
 * Reads a domain from stdin, resolves it with getaddrinfo(), tries to connect
 * to port 80 and prints whatever the server sends back.
 * Reviewer notes below mark the spots the answers identify as faulty.
 */
int main() {
    char domain[255];
    char netAddr[255];
    char GETrqst[MAXLINE];
    char recvLine[MAXLINE];
    int rl;

    printf("webscraper\n\n");

    printf("enter the domain of the website you want to scrape (eg: google.com): ");
    // NOTE(review): "%s" has no field width, so input longer than 254
    // characters overflows domain; the return value is not checked either.
    scanf("%s", domain);

    //address struct
    //2 structs for the getaddrinfo function
    // getResult is the hints struct (IPv4, stream sockets); getResult2
    // receives the head of the result list.
    struct addrinfo getResult, *getResult2;
    memset(&getResult, 0, sizeof(getResult));
    getResult.ai_family = AF_INET;
    getResult.ai_socktype = SOCK_STREAM;


    int addrErr = getaddrinfo(domain, NULL, &getResult, &getResult2);

    if (addrErr != 0) {
        // NOTE(review): getaddrinfo() reports failure through its return
        // value; gai_strerror(addrErr) would describe it - confirm.
        perror("getaddrinfo");
        exit(1);
    }

        //first half of ip
        // NOTE(review): sa_data is the raw payload of the generic sockaddr,
        // not an in_addr; the text produced here is overwritten just below.
    if (inet_ntop(AF_INET, getResult2->ai_addr->sa_data, netAddr, 255)) {
        //second half of ip
        // This is the conversion that actually uses the resolved address:
        // sin_addr taken from the sockaddr_in view of ai_addr.
        void *addr_in = &((struct sockaddr_in *) getResult2->ai_addr)->sin_addr;
        inet_ntop(AF_INET, addr_in, netAddr, 255);

        printf("ip for domain is: %s\n", netAddr);
    } else {
        perror("inet_ntop");
        exit(1);
    }

    struct sockaddr_in socket_address;
    //memset(&socket_address, 0, sizeof(socket_address));
    socket_address.sin_family = AF_INET;
    socket_address.sin_port = htons(80);

    // NOTE(review): this is the root cause named in the answers. sa_data holds
    // raw sockaddr bytes, not dotted-decimal text, so the resolved address is
    // never stored in sin_addr; copy
    // ((struct sockaddr_in *) getResult2->ai_addr)->sin_addr instead.
    int atonErr = inet_aton(getResult2->ai_addr->sa_data, &socket_address.sin_addr);

    // NOTE(review): inet_aton() is documented to return 0 on failure, so this
    // test looks inverted (the program continues only when parsing failed) -
    // confirm.
    if (atonErr != 0) {
        perror("inet_aton");
        exit(1);
    } else {
        printf("successfully converted string to struct\n");
    }

    freeaddrinfo(getResult2);

    printf("address struct setup\n");

    sleep(1);

    printf("setting up socket\n");

    // NOTE(review): the socket() result is not checked before use.
    int sock = socket(AF_INET, SOCK_STREAM, 0);

    sleep(1);

    printf("attempting connection to web server...\n");
    int connectStatus = connect(sock, (struct sockaddr *) &socket_address, sizeof(socket_address));

    if (connectStatus == -1) {
        perror("connect error");
        exit(1);
    } else {
        printf("connection success!\n");
    }

    sleep(1);
    //sets get command to get webpage. \r\n\r\n is at the end of every webpage url
    // NOTE(review): the request has no "Host:" header, which the answers
    // point out is required for HTTP/1.1.
    sprintf(GETrqst, "GET / HTTP/1.1\r\n\r\n");

    // NOTE(review): sizeof(GETrqst) sends the whole MAXLINE-byte buffer, not
    // just the request text; strlen(GETrqst) is the intended length.
    write(sock, GETrqst, sizeof(GETrqst));

    memset(recvLine, 0, MAXLINE);
    
    // recvLine is zeroed only once, before the loop; a later, shorter read
    // leaves bytes from the previous read after the new data when printed
    // with "%s".
    while ((rl = read(sock, recvLine, MAXLINE-1)) > 0 ) {
        printf("%s", recvLine);
    }

    // NOTE(review): exits with status 1 even after a successful run; the
    // return below is never reached.
    exit(1);

    return 0;
} 

我试过用 memset() 重置 sockaddr_in 结构,因为之前这样做是有效的,但现在不起作用了。我还检查了 IP 是否被正确保存和使用,但这些方法都没有奏效。我很迷茫,已经花了大约 2 个小时尝试解决这个问题。如有任何帮助,我将不胜感激!

我做了一些更改以使其成功发出 HTTP 请求:

  • 向 HTTP 1.1 请求添加了 Host: 字段。不提供该字段是错误的。
  • 我让它为 GETrqst 发送了适当数量的字节。应该是strlen(GETrqst),不是sizeof(GETrqst).
  • 您在 inet_aton(getResult2->ai_addr->sa_data, &socket_address.sin_addr); 中将错误的内容复制到 socket_address.sin_addr 中。 ((struct sockaddr_in *) getResult2->ai_addr)->sin_addr 中的内容正确,只需复制即可。

修改后的代码如下:

#define _DEFAULT_SOURCE
#include <errno.h>
#include <netdb.h>
#include <pthread.h>
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/types.h>

#define MAXLINE 4096

/*
 * Minimal plain-HTTP client.
 *
 * Reads a domain name from stdin, resolves it to IPv4 addresses, connects to
 * port 80 on the first address that accepts the connection, sends
 * "GET / HTTP/1.1" with a Host header and copies the raw response to stdout
 * until the server closes the connection.
 *
 * Returns 0 once the response has been printed, 1 on invalid input, when no
 * address could be connected to, or on a send/receive error. A name
 * resolution failure terminates the program with exit status 1.
 */
int main() {
    char domain[255];
    char GETrqst[MAXLINE];
    char recvLine[MAXLINE];

    printf("webscraper\n\n");

    printf("enter the domain of the website you want to scrape (eg: google.com): ");
    // Field width keeps the input within domain[] (254 chars + terminator).
    if(scanf("%254s", domain) != 1) return 1;

    // address struct
    // 2 structs for the getaddrinfo function: hints in, result list out
    struct addrinfo getResult = {0}, *getResult2;
    getResult.ai_family = AF_INET;
    getResult.ai_socktype = SOCK_STREAM;
    getResult.ai_protocol = IPPROTO_TCP;

    int addrErr = getaddrinfo(domain, NULL, &getResult, &getResult2);

    if(addrErr != 0) {
        // getaddrinfo reports errors through its return value, not errno,
        // so perror() would print an unrelated message.
        fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(addrErr));
        exit(1);
    }

    printf("attempting connection to web server...\n");

    int sock = -1;
    struct addrinfo *rp;
    // loop through the possible addresses
    for (rp = getResult2; rp != NULL; rp = rp->ai_next) {
        printf("%d %d %d\n", rp->ai_family, rp->ai_socktype, rp->ai_protocol);
        sock = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol);
        if (sock == -1) {
            perror("socket");
            continue;
        }
        printf("socket created - connecting ...\n");

        /* if you want to show the IP
        char netAddr[255];
        void* addr_in = &((struct sockaddr_in*)rp->ai_addr)->sin_addr;
        inet_ntop(AF_INET, addr_in, netAddr, sizeof netAddr);
        printf("IP: %s\n", netAddr);
        */

        // Zero the whole struct so padding (sin_zero) is not left as garbage.
        struct sockaddr_in socket_address = {0};
        socket_address.sin_family = rp->ai_family; // AF_INET;
        // No service was passed to getaddrinfo, so the port is set by hand.
        socket_address.sin_port = htons(80);
        socket_address.sin_addr = ((struct sockaddr_in*)rp->ai_addr)->sin_addr;

        if (connect(sock, (const struct sockaddr*) &socket_address,
                          sizeof socket_address) != -1)
        {
            // connection success
            break;
        }
        perror("connect");

        close(sock);
        sock = -1;
    }

    freeaddrinfo(getResult2);

    // rp is only compared against NULL here, never dereferenced after the free.
    if(rp == NULL) {
        printf("failed to connect\n");
        return 1;
    }

    sleep(1);

    printf("connection success!\n");

    sleep(1);

    // HTTP/1.1 requires a Host header; the blank line ends the header section.
    int reqLen = snprintf(GETrqst, sizeof GETrqst,
                          "GET / HTTP/1.1\r\nHost: %s:80\r\n\r\n", domain);
    if (reqLen < 0 || (size_t) reqLen >= sizeof GETrqst) {
        fprintf(stderr, "request too long\n");
        close(sock);
        return 1;
    }

    // write() may accept fewer bytes than requested, so send until done.
    size_t sent = 0;
    while (sent < (size_t) reqLen) {
        ssize_t w = write(sock, GETrqst + sent, (size_t) reqLen - sent);
        if (w == -1) {
            if (errno == EINTR) continue;
            perror("write");
            close(sock);
            return 1;
        }
        sent += (size_t) w;
    }

    // Copy exactly the bytes received; the body may contain NUL bytes, so the
    // data is not treated as a C string.
    ssize_t rl;
    while((rl = read(sock, recvLine, sizeof recvLine)) > 0) {
        fwrite(recvLine, 1, (size_t) rl, stdout);
    }

    int status = 0;
    if (rl == -1) {
        perror("read");
        status = 1;
    }

    close(sock);
    return status;
}

注意:现在大多数网站都使用 HTTPS,因此,除非您知道您尝试抓取的网站可通过纯 HTTP 访问,否则您可能还需要添加加密。如果您向 google.com:80 发出 HTTP 请求,它会回答如下内容:

HTTP/1.1 301 Moved Permanently
Location: http://www.google.com/
Content-Type: text/html; charset=UTF-8
Date: Tue, 07 Sep 2021 18:57:22 GMT
Expires: Thu, 07 Oct 2021 18:57:22 GMT
Cache-Control: public, max-age=2592000
Server: gws
Content-Length: 219
X-XSS-Protection: 0
X-Frame-Options: SAMEORIGIN

<HTML><HEAD><meta http-equiv="content-type" content="text/html;charset=utf-8">
<TITLE>301 Moved</TITLE></HEAD><BODY>
<H1>301 Moved</H1>
The document has moved
<A HREF="http://www.google.com/">here</A>.
</BODY></HTML>

正如我在热门评论中提到的,您的应用确实连接到了该网站。但是,它不接受您的 HTTP 请求,并返回了 HTTP 错误 400。

但是,这并不正确。它确实连接到了某个网站,但不是想要的那个网站。

有两个问题。

  1. sin_addr 在您的 connect 调用中全为零(即 0.0.0.0)。
  2. 您的 HTTP 请求不完整。

您的代码:

inet_aton(getResult2->ai_addr->sa_data, &socket_address.sin_addr);

正在存储所有零。

我将其注释掉,并在您的 inet_ntop 调用处加入了 memcpy:

// NOTE(review): sa_data is the raw payload of the generic sockaddr, not an
// in_addr; this call only acts as a guard and its text in netAddr is
// overwritten by the second inet_ntop() below.
if (inet_ntop(AF_INET, getResult2->ai_addr->sa_data, netAddr, 255)) {
    // take the IPv4 address from the sockaddr_in view of ai_addr
    void *addr_in = &((struct sockaddr_in *) getResult2->ai_addr)->sin_addr;

    // render that address as text for the message below
    inet_ntop(AF_INET, addr_in, netAddr, 255);

    printf("%sip for domain is: %s\n", T_GREEN, netAddr);

    // the actual fix: copy the binary address into socket_address.sin_addr
    // instead of running inet_aton() on sa_data
    memcpy(&socket_address.sin_addr,addr_in,
        sizeof(socket_address.sin_addr));
}

您的请求不完整。它需要一个 Host: 字段。

那么,您要发送:

GET / HTTP/1.1\r\n
Host: <url_of_server>\r\n
\r\n

这是我想出的最终代码:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <pthread.h>
#include <errno.h>
#include <netdb.h>
#include <arpa/inet.h>
#include <regex.h>
//#include "tColor copy.h"

#define MAXLINE 4096

#define T_RED       "RED: "
#define T_GREEN     "GREEN: "
#define T_BLUE      "BLUE: "

/*
 * SENDLINE(fmt, ...) - send one CRLF-terminated request line.
 *
 * Formats the printf-style arguments into buf, echoes the line to stdout,
 * appends "\r\n" and writes the result to the socket.
 *
 * Requirements at the call site: an `int sock` holding a connected socket
 * must be in scope. buf and buflen (defined below) are used as scratch space
 * and are overwritten on every use.
 *
 * Failure handling: a line that does not fit in buf (two bytes are reserved
 * for the CRLF) or a failed/short write terminates the program with exit
 * status 1, because a truncated request cannot be sent meaningfully.
 *
 * Uses the standard C99 variadic form (... / __VA_ARGS__) rather than the
 * GNU-only named form.
 */
#define SENDLINE(...) \
    do { \
        int sendline_n = snprintf(buf, sizeof(buf) - 2, __VA_ARGS__); \
        if (sendline_n < 0 || (size_t) sendline_n >= sizeof(buf) - 2) { \
            fprintf(stderr, "SENDLINE: line too long\n"); \
            exit(1); \
        } \
        buflen = (size_t) sendline_n; \
        printf("SENDLINE: %s\n",buf); \
        memcpy(&buf[buflen], "\r\n", 3); \
        if (write(sock, buf, buflen + 2) != (ssize_t) (buflen + 2)) { \
            perror("SENDLINE: write"); \
            exit(1); \
        } \
    } while (0)

/* Scratch buffer for SENDLINE: holds the most recently formatted line. */
char buf[MAXLINE];
/* Length of the line in buf, excluding the trailing CRLF. */
size_t buflen;

/*
 * Fetch "/" from a web server over plain HTTP and print the raw response.
 *
 * The domain is taken from the first command-line argument, or read from
 * stdin when no argument is given. The name is resolved to an IPv4 address,
 * a TCP connection is opened to port 80, the request is sent line by line
 * with SENDLINE, and everything the server returns is copied to stdout.
 *
 * Returns 0 after the response has been printed. Any failure (bad input,
 * resolution, socket, connect, read) ends the program with exit status 1.
 */
int
main(int argc,char **argv)
{
    char domain[255];
    char netAddr[INET_ADDRSTRLEN];
    char recvLine[MAXLINE];
    ssize_t rl;

    // skip the program name so argv[0] is the first real argument
    --argc;
    ++argv;

    printf("%swebscraper\n\n", T_GREEN);

    if (argc > 0) {
        // bounded copy: an over-long argument must not overflow domain[]
        int copied = snprintf(domain, sizeof(domain), "%s", argv[0]);
        if (copied < 0 || (size_t) copied >= sizeof(domain)) {
            printf("%s", T_RED);
            fprintf(stderr, "domain name too long\n");
            exit(1);
        }
        printf("domain is '%s'\n",domain);
    }
    else {
        printf("%senter the domain of the website you want to scrape (eg: google.com): ", T_BLUE);
        // field width keeps the input within domain[] (254 chars + NUL)
        if (scanf("%254s", domain) != 1) {
            printf("%s", T_RED);
            fprintf(stderr, "no domain given\n");
            exit(1);
        }
    }

    // address struct
    // 2 structs for the getaddrinfo function: hints in, result list out
    struct addrinfo getResult,
    *getResult2;

    memset(&getResult, 0, sizeof(getResult));
    getResult.ai_family = AF_INET;
    getResult.ai_socktype = SOCK_STREAM;

    int addrErr = getaddrinfo(domain, NULL, &getResult, &getResult2);

    if (addrErr != 0) {
        printf("%s", T_RED);
        // getaddrinfo reports errors through its return value, not errno
        fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(addrErr));
        exit(1);
    }

    // Build the connect() target from the first result. The struct is zeroed
    // first so no field (including padding) is left uninitialized.
    struct sockaddr_in socket_address;

    memset(&socket_address, 0, sizeof(socket_address));
    socket_address.sin_family = AF_INET;
    // no service was passed to getaddrinfo, so the port is set by hand
    socket_address.sin_port = htons(80);
    socket_address.sin_addr =
        ((struct sockaddr_in *) getResult2->ai_addr)->sin_addr;

    freeaddrinfo(getResult2);

    // text form of the address, for display only
    if (inet_ntop(AF_INET, &socket_address.sin_addr, netAddr,
        sizeof(netAddr)) == NULL) {
        printf("%s", T_RED);
        perror("inet_ntop");
        exit(1);
    }

    printf("%sip for domain is: %s\n", T_GREEN, netAddr);

    printf("%saddress struct setup\n", T_GREEN);

    sleep(1);

    printf("%ssetting up socket\n", T_BLUE);

    int sock = socket(AF_INET, SOCK_STREAM, 0);

    if (sock == -1) {
        printf("%s", T_RED);
        perror("socket");
        exit(1);
    }

    sleep(1);

    printf("%sattempting connection to web server...\n", T_BLUE);
    int connectStatus = connect(sock,
        (struct sockaddr *) &socket_address, sizeof(socket_address));

    if (connectStatus == -1) {
        printf("%s", T_RED);
        perror("connect error");
        close(sock);
        exit(1);
    }
    else {
        printf("%sconnection success!\n", T_GREEN);
    }

    // Request line plus headers; HTTP/1.1 requires Host, and the empty line
    // terminates the header section.
    SENDLINE("GET / HTTP/1.1");
    //SENDLINE("Host: %s",netAddr);
    SENDLINE("Host: %s",domain);
    SENDLINE("Accept: */*");
    //SENDLINE("Connection: keep-alive");
    SENDLINE("");

    // Copy exactly the bytes received; the loop ends when the server closes
    // the connection (read returns 0) or on error (read returns -1).
    while ((rl = read(sock, recvLine, sizeof(recvLine))) > 0) {
        fwrite(recvLine,1,(size_t) rl,stdout);
    }

    if (rl == -1) {
        printf("%s", T_RED);
        perror("read");
        close(sock);
        exit(1);
    }

    close(sock);

    return 0;
}