吾爱破解 - 52pojie.cn

 找回密码
 注册[Register]

QQ登录

只需一步,快速开始

查看: 7513|回复: 28
上一主题 下一主题
收起左侧

[Python 转载] 通过Python来自动化爬取京东数据(商品信息、商品评价标签、商品评价内容)

  [复制链接]
跳转到指定楼层
楼主
QingYi. 发表于 2021-6-19 22:06 回帖奖励
需要用到的东西有 PyMySQL:https://github.com/PyMySQL/PyMySQL
还有 peewee:https://github.com/coleifer/peewee
主代码另外还用到了 scrapy、selenium 和 webdriver_manager(见主代码开头的 import)。

上面的内容需要自行安装,都有安装文档,极其简单。

然后需要先运行下面这段代码,在数据库中生成表(名为 jd 的数据库需要事先建好)

每个字段的含义都写在代码里各字段的 verbose_name 中了


[Python] 纯文本查看 复制代码
from peewee import *

# Connection to the MySQL database named `jd` on localhost:3306; the models below bind to it.
# NOTE(review): the user name and password are hard-coded here — adjust them for your own server.
db = MySQLDatabase('jd', host='localhost', port=3306, user="root", password="sa")


class BaseModel(Model):
    """Common base class: every model deriving from it uses the shared ``db`` connection."""

    class Meta:
        # The inner Meta class is where the target database is declared.
        database = db


'''
CharField columns have a fixed maximum length (max_length);
use TextField when the length cannot be bounded in advance.
'''


class Good(BaseModel):
    """A product scraped from a JD item page, together with its review counters.

    Column sizing notes:

    * ``id`` is a 64-bit integer. JD SKU ids such as 100009956275 (the id
      scraped by the main script in this file) do not fit into MySQL's
      32-bit ``INT``, which is what ``IntegerField`` maps to.
    * ``name`` and ``supplier`` allow up to 255 characters; real product
      titles are far longer than the previous 10-character limit, which
      makes MySQL reject or truncate the value.

    Tables that were already created with the old column definitions are not
    altered by this class; they need to be dropped/recreated or migrated.
    """
    # When designing a table, look at real data first and pick the matching
    # column type; store numbers as numbers, not as strings.
    id = BigIntegerField(primary_key=True, verbose_name="商品id")  # JD SKU id
    name = CharField(max_length=255, verbose_name="商品名称")  # product title
    content = TextField(default="", verbose_name="商品描述")  # description HTML
    supplier = CharField(max_length=255, verbose_name="供应商")  # supplier / shop
    ggbz = TextField(default="", verbose_name="规格包装")  # spec & packaging HTML
    image_list = TextField(default="", verbose_name="介绍图")  # JSON list of image URLs
    price = FloatField(default=0.0, verbose_name="价格")
    good_rate = IntegerField(default=0, verbose_name="好评率")  # positive-review rate
    comment_nums = IntegerField(default=0, verbose_name="评论数")  # all reviews
    image_comment_nums = IntegerField(default=0, verbose_name="晒图数")  # reviews with pictures
    video_comment_nums = IntegerField(default=0, verbose_name="视频数")  # reviews with video
    add_comment_nums = IntegerField(default=0, verbose_name="追评数")  # follow-up reviews
    good_comment_nums = IntegerField(default=0, verbose_name="好评数")  # positive reviews
    mid_comment_nums = IntegerField(default=0, verbose_name="中评数")  # neutral reviews
    bad_comment_nums = IntegerField(default=0, verbose_name="差评数")  # negative reviews




class GoodEvaluateSummary(BaseModel):
    """One review-tag summary row of a product: a tag label and the count stored for it."""
    good = ForeignKeyField(Good, verbose_name="商品")  # product the tag belongs to
    tag = CharField(max_length=10, verbose_name="标签")  # tag label, at most 10 characters
    num = IntegerField(default=0, verbose_name="数量")  # count stored for the tag



class GoodEvaluate(BaseModel):
    """A single user review of a product.

    ``good_info`` is a ``TextField``: the scraper in this file stores a
    JSON-encoded list of the purchased variant's attributes in it, which is
    far longer than the 10 characters the previous ``CharField`` allowed
    (non-ASCII characters are escaped to six characters each by
    ``json.dumps``).

    Tables that were already created with the old column definition are not
    altered by this class; they need to be recreated or migrated.
    """
    id = CharField(primary_key=True)  # review identifier, stored as a string
    good = ForeignKeyField(Good, verbose_name="商品")  # reviewed product
    user_head_url = CharField(verbose_name="用户头像")  # reviewer avatar URL
    user_name = CharField(verbose_name="用户名")  # reviewer name
    good_info = TextField(verbose_name="当前评论的商品信息")  # JSON list describing the reviewed variant
    evaluate_time = DateTimeField(verbose_name="评价时间")  # when the review was posted
    content = TextField(default="", verbose_name="评价内容")  # review text
    star = IntegerField(default=0, verbose_name="评分")  # star rating
    comment_nums = IntegerField(default=0, verbose_name="评论数")  # replies to the review
    like_nums = IntegerField(default=0, verbose_name="点赞数")  # likes on the review
    image_list = TextField(default="", verbose_name="图片")  # JSON list of image URLs
    video_list = TextField(default="", verbose_name="视频")  # JSON list of video URLs



if __name__ == '__main__':
    # When run as a script, create the tables for the three models through the `db` connection.
    db.create_tables([Good, GoodEvaluateSummary, GoodEvaluate])


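然后执行主代码(注意:上面的建表代码要保存为 models.py,并和主代码放在同一目录下,因为主代码是通过 from models import * 来导入这些模型的,否则会报 NameError: name 'Good' is not defined)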

[Python] 纯文本查看 复制代码
import time
from datetime import datetime

from scrapy import Selector

from models import *
import json
import re
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

# One Chrome session shared by the whole script, started as soon as the module is loaded;
# the value returned by ChromeDriverManager().install() is handed to webdriver.Chrome.
# NOTE(review): passing that value positionally is presumably only accepted by older
# Selenium releases — confirm against the installed Selenium version.
browse = webdriver.Chrome(ChromeDriverManager().install())


def strToNum(str):
    """Convert a review-count label such as ``全部评价(20万+)`` to an integer.

    The first number found in the text is used. When the text contains the
    unit ``万`` (ten thousand) the number is scaled by 10000, and a decimal
    part is honoured, so ``"2.5万+"`` yields 25000 instead of 20000. Without
    the unit only the integer part is used.

    Args:
        str: the label text (the name shadows the builtin, but it is kept so
            existing callers are unaffected).

    Returns:
        The parsed count, or 0 when the text is empty or contains no digits.
    """
    if not str:
        return 0
    match = re.search(r"(\d+)(?:\.(\d+))?", str)
    if not match:
        return 0
    whole = int(match.group(1))
    if "万" not in str:
        return whole
    # Scale with integer arithmetic only: pad/cut the decimals to the four
    # digits that one 万 covers, which avoids float rounding artefacts.
    decimals = (match.group(2) or "")[:4].ljust(4, "0")
    return whole * 10000 + int(decimals)


def _page_selector():
    """Return a Selector over the browser's current DOM (call again after every click)."""
    return Selector(text=browse.page_source)


def _find_by_xpath(xpath):
    """Locate one element by XPath; raises NoSuchElementException when nothing matches."""
    # find_element(By.XPATH, ...) is used instead of find_element_by_xpath,
    # which newer Selenium releases no longer provide. Imported locally so the
    # file's import block does not have to change.
    from selenium.webdriver.common.by import By
    return browse.find_element(By.XPATH, xpath)


def _save_evaluate(item, good):
    """Store one review taken from a ``comment-item`` node.

    Returns the review id, or None when the node lacks the data a row needs
    (no id, or no parsable review time) and was therefore skipped.
    """
    guid = item.xpath('./@data-guid').extract()
    order_info = item.xpath(".//div[@class='order-info']/span/text()").extract()
    if not guid or not order_info:
        return None
    try:
        # The last span of the order info holds the review time.
        evaluate_time = datetime.strptime(order_info[-1].strip(), "%Y-%m-%d %H:%M")
    except ValueError:
        return None

    good_evaluate = GoodEvaluate(good=good)
    good_evaluate.id = guid[0]
    good_evaluate.evaluate_time = evaluate_time
    # Everything before the time describes the purchased variant.
    good_evaluate.good_info = json.dumps(order_info[:-1])

    head_urls = item.xpath(".//div[@class='user-info']//img/@src").extract()
    good_evaluate.user_head_url = head_urls[0] if head_urls else ""
    good_evaluate.user_name = "".join(item.xpath(".//div[@class='user-info']/text()").extract()).strip()

    # The star rating is the trailing digit of the rating element's class attribute.
    star_class = item.xpath('./div[2]/div[1]/@class').extract()
    if star_class and star_class[0][-1:].isdigit():
        good_evaluate.star = int(star_class[0][-1])

    texts = item.xpath('./div[2]/p[1]/text()').extract()
    good_evaluate.content = texts[0].strip() if texts else ""

    image_list = item.xpath("./div[2]//div[@class='pic-list J-pic-list']/a/img/@src").extract()
    video_list = item.xpath("./div[2]//div[@class='J-video-view-wrap clearfix']//video/@src").extract()
    good_evaluate.image_list = json.dumps(image_list)
    good_evaluate.video_list = json.dumps(video_list)

    # strToNum yields 0 for a missing or non-numeric label instead of raising.
    good_evaluate.like_nums = strToNum("".join(item.xpath(".//div[@class='comment-op']/a[2]/text()").extract()))
    good_evaluate.comment_nums = strToNum("".join(item.xpath(".//div[@class='comment-op']/a[3]/text()").extract()))

    # The id is a caller-supplied primary key, so the first save must be a
    # forced INSERT and a re-run must be an UPDATE.
    existed = GoodEvaluate.select().where(GoodEvaluate.id == good_evaluate.id).exists()
    good_evaluate.save(force_insert=not existed)
    return good_evaluate.id


def parse_good(good_id):
    """Scrape one JD product page and store the product, its review tags and its reviews.

    Steps: open ``https://item.jd.com/<good_id>.html`` in the shared browser,
    read the basic product data, open the spec/packaging tab, open the review
    tab, save the product and its tag summary, then walk through the review
    pages until there is no "next page" link.

    Fixes compared with the previous version:

    * the supplier extracted by the regex is no longer overwritten with the
      raw HTML of the service block;
    * the DOM snapshot is taken after the wait that follows each click, and a
      missing element no longer raises ``IndexError``;
    * the snapshot is refreshed after every "next page" click, so every page
      is parsed once instead of the first page over and over (which failed
      with a duplicate primary key);
    * rows are inserted or updated depending on whether they already exist,
      so the scraper can be re-run for the same product.

    Args:
        good_id: JD SKU id of the product.
    """
    # Open the page; the sleeps (in seconds) give its scripts time to render.
    browse.get("https://item.jd.com/{}.html".format(good_id))
    time.sleep(5)
    sel = _page_selector()

    good = Good(id=good_id)
    good.name = "".join(sel.xpath('//div[@class="sku-name"]/text()').extract()).strip()
    price_text = "".join(sel.xpath('//span[@class="price J-p-{}"]/text()'.format(good_id)).extract()).strip()
    try:
        good.price = float(price_text)
    except ValueError:
        # Price missing or not rendered yet: keep the model's default.
        pass
    good.content = "".join(sel.xpath("//div[@id='detail']//div[@class='tab-con']").extract())
    good.image_list = json.dumps(sel.xpath("//div[@id='spec-list']//img/@src").extract())

    # Supplier: the sub-domain of the first shop link, or "jd" when there is none.
    supplier_info = "".join(sel.xpath("//div[@id='summary-service']").extract())
    re_match = re.search(r'<a href="//(.*?)\.jd\.com', supplier_info)
    good.supplier = re_match.group(1) if re_match else "jd"

    # Open the "spec & packaging" tab; the DOM must be re-read after a click.
    try:
        _find_by_xpath("//div[@class='tab-main large']//li[contains(text(),'规格与包装')]").click()
    except NoSuchElementException:
        pass  # no such tab on this page: ggbz keeps its default
    else:
        time.sleep(2)
        sel = _page_selector()
        good.ggbz = "".join(sel.xpath('//div[@id="detail"]/div[@class="tab-con"]').extract())

    # Open the review tab, wait for it to load, and only then take the snapshot.
    time.sleep(2)
    try:
        _find_by_xpath("//li[@clstag='shangpin|keycount|product|shangpinpingjia_1']").click()
    except NoSuchElementException:
        pass
    time.sleep(5)
    sel = _page_selector()

    tag_list = sel.xpath("//div[@class='tag-list tag-available']//span/text()").extract()

    rate_match = re.search(r"\d+", "".join(sel.xpath("//div[@class='percent-con']/text()").extract()))
    if rate_match:
        good.good_rate = int(rate_match.group())

    # Filter label -> Good attribute that stores its count.
    counters = {
        "全部评价": "comment_nums",
        "晒图": "image_comment_nums",
        "视频晒单": "video_comment_nums",
        "追评": "add_comment_nums",
        "好评": "good_comment_nums",
        "中评": "mid_comment_nums",
        "差评": "bad_comment_nums",
    }
    for link in sel.xpath('//ul[@class="filter-list"]/li/a'):
        labels = link.xpath('./text()').extract()
        counts = link.xpath('./em/text()').extract()
        if not labels or not counts:
            continue
        attr = counters.get(labels[0].strip())
        if attr:
            setattr(good, attr, strToNum(counts[0]))

    # The primary key is supplied by us, so a first save has to be a forced INSERT.
    existed_good = Good.select().where(Good.id == good.id).exists()
    good.save(force_insert=not existed_good)

    for tag in tag_list:
        # Tags look like "清晰度高(227)": split the label from the count.
        obj = re.match(r"(.*)\((\d+)\)", tag)
        if not obj:
            continue
        tag_name = obj.group(1)
        summary = GoodEvaluateSummary.select().where(GoodEvaluateSummary.good == good,
                                                     GoodEvaluateSummary.tag == tag_name).first()
        if summary is None:
            summary = GoodEvaluateSummary(good=good, tag=tag_name)
        summary.num = int(obj.group(2))
        summary.save()

    # Reviews, page by page.
    seen_ids = set()
    while True:
        new_on_page = 0
        for item in sel.xpath("//div[@class='comment-item']"):
            evaluate_id = _save_evaluate(item, good)
            if evaluate_id is not None and evaluate_id not in seen_ids:
                seen_ids.add(evaluate_id)
                new_on_page += 1
        if not new_on_page:
            # Nothing new on this page: stop rather than loop forever on a
            # page that did not change.
            break
        try:
            next_page = _find_by_xpath('//div[@id="comment"]//a[@class="ui-pager-next"]')
        except NoSuchElementException:
            break  # last page reached
        next_page.send_keys("\n")
        time.sleep(3)
        sel = _page_selector()


if __name__ == '__main__':
    try:
        parse_good(100009956275)
    finally:
        # Always shut the browser down, even when scraping fails, so that no
        # Chrome/chromedriver process is left running.
        browse.quit()
                    

爬取京东.png (288.36 KB, 下载次数: 20)

爬取京东.png

免费评分

参与人数 9吾爱币 +14 热心值 +6 收起 理由
wuai130113 + 1 + 1 谢谢@Thanks!
yihufengyue + 1 + 1 谢谢@Thanks!
hwh425 + 1 热心回复!
dghg + 1 欢迎分析讨论交流,吾爱破解论坛有你更精彩!
安道尔的鱼 + 1 + 1 热心回复!
1241929750 + 1 感谢发布原创作品,吾爱破解论坛因你更精彩!
z406799037 + 1 谢谢@Thanks!
苏紫方璇 + 7 + 1 欢迎分析讨论交流,吾爱破解论坛有你更精彩!
hshcompass + 1 + 1 厉害。收藏学习。

查看全部评分

发帖前要善用论坛搜索功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。

推荐
ruicky 发表于 2021-12-7 13:43
放到本地跑,发现有报错,不知道楼主有吗?

报错行代码

good_rate = int(sel.xpath("//div[@class='percent-con']/text()").extract()[0])


报错内容:
Traceback (most recent call last):
  File "index.py", line 185, in <module>
    parse_good(100009956275)
  File "index.py", line 79, in parse_good
    good_rate = int(sel.xpath("//div[@class='percent-con']/text()").extract()[0])
IndexError: list index out of range
推荐
wuai130113 发表于 2022-4-8 20:08
报错了 楼主啊  第二篇代码
Traceback (most recent call last):
  File "jd2.py", line 184, in <module>
    parse_good(100009956275)
  File "jd2.py", line 33, in parse_good
    good = Good(good_id)
NameError: name 'Good' is not defined
沙发
hshcompass 发表于 2021-6-19 22:09
3#
tek2y 发表于 2021-6-19 22:12
感谢分享经验
4#
三滑稽甲苯 发表于 2021-6-19 22:31
有没有爬取京东物流的爬虫
5#
herry_heng 发表于 2021-6-19 23:08
感谢分享!
6#
璐璐诺 发表于 2021-6-19 23:09
过来学习学习代码
7#
AsuraSong 发表于 2021-6-19 23:12
谢谢楼主分享,这思路好
8#
CCQc 发表于 2021-6-20 07:40
学习爬取思路,具体的也可以淘宝吗?
9#
wpwpwp 发表于 2021-6-20 07:48
看着很厉害,收藏
10#
 楼主| QingYi. 发表于 2021-6-20 11:04 |楼主
CCQc 发表于 2021-6-20 07:40
学习爬取思路,具体的也可以淘宝吗?

没试过 下次有空可以试试
您需要登录后才可以回帖 登录 | 注册[Register]

本版积分规则

返回列表

RSS订阅|小黑屋|处罚记录|联系我们|吾爱破解 - LCG - LSG ( 京ICP备16042023号 | 京公网安备 11010502030087号 )

GMT+8, 2024-11-24 23:50

Powered by Discuz!

Copyright © 2001-2020, Tencent Cloud.

快速回复 返回顶部 返回列表