xiaochou56 发表于 2021-9-8 14:30

C语言版爬虫(爬取大乐透往期全部中奖号码)

C语言
编译及运行环境:Ubuntu16.04 64位
依赖:libcurl.so.4.6.0 libgumbo.so.1.0.0
运行示例:

爬取数据展示:

附代码:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <curl/curl.h>
#include <include/gumbo.h>

/* Output file that receives the scraped records (appended to by print_data). */
#define DLT_FILE_PATH "dlt.txt"

/*
 * Terminal colour helpers. Each macro expands to a complete printf
 * statement INCLUDING the trailing semicolon, so it is used as a bare
 * word on its own line (e.g. `RED`), not as `RED;` inside an expression.
 * The escape sequences are ANSI SGR codes.
 */
#define CLOSE   printf("\033[0m");   /* reset all attributes */
#define RED     printf("\033[31m");
#define GREEN   printf("\033[36m");  /* NOTE(review): SGR 36 is cyan; 32 is green -- confirm intent */
#define YELLOW  printf("\033[33m");  /* a space is required between the name and the body */
#define BLUE    printf("\033[34m");

/*
 * Growable byte buffer used to collect an HTTP response body.
 * The code in this file zero-initialises it, hands it to libcurl as the
 * write-callback user data, and releases `memory` with free().
 */
typedef struct {
    uint8_t *memory;    /* heap buffer holding the bytes received so far */
    size_t   size;      /* number of payload bytes accounted for in `memory` */
} MyResp;

/* File-scope buffer holding the most recently built request URL. */
char url[256] = "";

/**
 * Build the history-page request URL for the given range.
 *
 * The text is written into the file-scope `url` buffer, so the returned
 * pointer is overwritten by the next call and must not be freed.
 *
 * @param start  value substituted into the `start` query parameter
 * @param end    value substituted into the `end` query parameter
 * @return pointer to the NUL-terminated URL stored in `url`
 */
char *getURLStr(int32_t start, int32_t end){
    /* sizeof keeps the bound tied to the actual buffer declaration. */
    snprintf(url, sizeof url,
             "http://datachart.500.com/dlt/history/newinc/history.php?start=%d&end=%d&sort=1",
             (int)start, (int)end);
    return url;
}

/**
 * libcurl write callback: append one received chunk to a MyResp buffer.
 *
 * The buffer is grown with realloc and kept NUL-terminated after every
 * chunk (the terminator is not counted in `size`).
 *
 * @param contents  chunk data delivered by libcurl
 * @param size      element size reported by libcurl
 * @param num       number of elements in the chunk
 * @param userData  the MyResp registered through CURLOPT_WRITEDATA
 * @return number of bytes consumed (size * num), or 0 on failure; a return
 *         value different from the chunk size makes libcurl abort the transfer
 */
static size_t writeMemFunc(void *contents, size_t size, size_t num, void *userData) {
    MyResp *mem = (MyResp *)userData;

    /* Reject chunk sizes whose product, or whose sum with the current
     * length plus the terminator, would wrap around size_t. */
    if (size != 0 && num > SIZE_MAX / size) {
        return 0;
    }
    size_t realsize = size * num;
    if (realsize >= SIZE_MAX - mem->size) {
        return 0;
    }

    /* Use a temporary so the existing buffer stays owned by `mem` (and is
     * still freed by the caller) when realloc fails. */
    uint8_t *ptr = realloc(mem->memory, mem->size + realsize + 1);
    if (!ptr) {
        /* out of memory! */
        printf("Not enough memory (realloc returned NULL).\n");
        return 0;
    }
    mem->memory = ptr;

    /* Append at the current end of the data, not over the pointer member. */
    if (realsize != 0) {
        memcpy(mem->memory + mem->size, contents, realsize);
    }
    mem->size += realsize;
    mem->memory[mem->size] = 0;   /* terminate the data, keep the buffer */

    return realsize;
}

/**
 * Append the text of an element's first child to DLT_FILE_PATH, formatted
 * according to the element's `class` attribute value.
 *
 *   class "t_tr1"  : text containing "-" is written as "| <text>\n"
 *   class "cfont2" : text of exactly two characters is written as "<text> "
 *   class "cfont4" : any text is written as "<text> "
 *
 * Anything else (other class values, no children, a first child that is
 * not a text node) produces no output, and the file is not opened.
 *
 * @param node  element node; its `v.element.children` vector is read, so
 *              the caller must pass a node of type GUMBO_NODE_ELEMENT
 * @param attr  the node's `class` attribute
 */
void print_data(GumboNode *node,GumboAttribute *attr)
{
    const GumboVector *children = &node->v.element.children;
    if (children->length == 0) {
        return;                         /* no first child to inspect */
    }

    const GumboNode *ip = children->data[0];
    if (ip->type != GUMBO_NODE_TEXT) {
        return;                         /* every branch below needs a text node */
    }

    const char *text = ip->v.text.text;
    const char *prefix = "";
    const char *suffix = " ";

    if (strcmp(attr->value, "t_tr1") == 0) {
        if (strstr(text, "-") == NULL) {
            return;
        }
        prefix = "| ";
        suffix = "\n";
    } else if (strcmp(attr->value, "cfont2") == 0) {
        if (strlen(text) != 2) {
            return;
        }
    } else if (strcmp(attr->value, "cfont4") != 0) {
        return;
    }

    /* Write straight to the file: no fixed-size intermediate buffer that a
     * long text node could overflow. */
    FILE *file = fopen(DLT_FILE_PATH, "a+");
    if (file == NULL) {
        return;
    }
    fprintf(file, "%s%s%s", prefix, text, suffix);
    fclose(file);
}

/**
 * Depth-first walk over the parse tree rooted at `node`.
 *
 * For every element that carries a `class` attribute, print_data() is
 * called with the element and that attribute; then each child is visited
 * recursively. Non-element nodes are ignored.
 *
 * @param node  subtree root to visit
 * @param tag   not consulted by the traversal; kept so existing callers
 *              keep compiling
 */
void get_data(GumboNode *node, GumboTag tag)
{
    (void)tag;

    if (node->type != GUMBO_NODE_ELEMENT) {
        return;
    }

    GumboAttribute *attr = gumbo_get_attribute(&node->v.element.attributes, "class");
    if (attr != NULL) {
        print_data(node, attr);
    }

    /* Visit each child exactly once. Because `tag` never influences the
     * walk, a separate pass for GUMBO_TAG_DT elements would only repeat
     * this loop and emit every record under such an element twice. */
    const GumboVector *children = &node->v.element.children;
    for (unsigned int i = 0; i < children->length; ++i) {
        get_data(children->data[i], GUMBO_TAG_DT);
    }
}

/**
 * Parse an HTML response body and pass its tree to get_data().
 *
 * The buffer is handed to gumbo together with its explicit length, so it
 * does not need to be NUL-terminated and no temporary copy is made.
 *
 * @param resp    response bytes; not modified and not freed here
 * @param resLen  number of bytes in `resp`
 * @return 0 after the tree has been walked, -1 if there is nothing to
 *         parse or the parser returned no output
 */
int32_t parseResponse(uint8_t *resp, uint32_t resLen)
{
    if (resp == NULL || resLen == 0) {
        return -1;
    }

    GumboOutput *output = gumbo_parse_with_options(&kGumboDefaultOptions,
                                                   (const char *)resp, resLen);
    if (output == NULL) {
        return -1;
    }

    get_data(output->root, GUMBO_TAG_DT);
    gumbo_destroy_output(&kGumboDefaultOptions, output);

    return 0;
}

int32_t httpRequest(int32_t start, int32_t end)
{
    CURL *curl = curl_easy_init();
    if(curl == NULL){
      printf("curl init failed.\n");
      return -1;
    }

    struct curl_slist *headers = curl_slist_append(NULL, "User-Agent:Linux");
    curl_slist_append(headers, "Connection:keep-alive");
    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);

    MyResp resBuff;
    memset(&resBuff, 0, sizeof(MyResp));
    curl_easy_setopt(curl, CURLOPT_URL, getURLStr(start, end));
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writeMemFunc);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, &resBuff);

    curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
    CURLcode ret;
    ret = curl_easy_perform(curl);
    curl_slist_free_all(headers);
    if (ret != CURLE_OK) {
      free(resBuff.memory);
      curl_easy_cleanup(curl);
      printf("curl_easy_perform failed.Ret:%d", ret);
      return -1;
    }

    long info;
    curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &info);
    curl_easy_cleanup(curl);
    RED
    printf("RESPONSE_CODE: %ld RESP_SIZE: %lu\n", info, resBuff.size);
    CLOSE
    ret = parseResponse(resBuff.memory, resBuff.size);
    free(resBuff.memory);

    return 0;
}

/**
 * Program entry point: delete any previous output file, then fetch and
 * parse the range 7001..70001.
 *
 * @return EXIT_SUCCESS when httpRequest() reports 0, EXIT_FAILURE otherwise
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    /* Start from a clean file; print_data() only ever appends. A failure
     * here (e.g. the file does not exist yet) is not an error. */
    remove(DLT_FILE_PATH);

    return httpRequest(7001, 70001) == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}

whathell 发表于 2021-9-8 16:19

都是人工控制的,{:1_905:}

wanghao6912 发表于 2021-9-8 16:42

whathell 发表于 2021-9-8 16:19
都是人工控制的,

说啥大实话

刘伟坤 发表于 2021-9-8 17:03

whathell 发表于 2021-9-8 16:19
都是人工控制的,

净说大实话

mckof 发表于 2021-9-8 17:10

当做学习交流就好了,除非兄台找到必胜的办法

吾爱福利 发表于 2021-9-8 17:13

哈哈哈 二等奖以下的小奖适当放一点

alien0774 发表于 2021-9-8 17:14

再来个大数据选号

SomerHalder 发表于 2021-9-8 17:38

吾爱福利 发表于 2021-9-8 17:13
哈哈哈 二等奖以下的小奖适当放一点

放也不是普通人三天两头能碰到的 太真实了哈哈哈哈

转一圈 发表于 2021-10-20 09:47

感谢分享{:1_937:}
页: [1]
查看完整版本: C语言版爬虫(爬取大乐透往期全部中奖号码)