需要用到的东西有 PyMySQL : https://github.com/PyMySQL/PyMySQL
还有peewee :https://github.com/coleifer/peewee
上面的内容需要自行安装,都有安装文档,极其简单。
然后先运行下面这段代码(保存为 models.py,后面的主代码会通过 from models import * 引用它),在数据库中生成表。
各字段的含义都写在代码里了(见每个字段的 verbose_name)。
[Python] 纯文本查看 复制代码
from peewee import *
# Database handle used by every model below: MySQL database 'jd' on localhost:3306.
# The credentials are hard-coded here; adjust them to your own setup.
db = MySQLDatabase('jd', host='localhost', port=3306, user="root", password="sa")
class BaseModel(Model):
    """Common parent of all models; binds them to the module-level ``db``."""

    class Meta:
        # Every subclass inherits this database binding.
        database = db
'''
CharField columns have a fixed maximum length (max_length).
Use TextField when the maximum length cannot be known in advance.
'''
class Good(BaseModel):
    """A product and the aggregate statistics of its evaluations."""

    # When designing the table, look at the real data carefully to make sure
    # every column gets the right type.
    # Numbers are stored as numbers, not as strings.

    # Product ids such as 100009956275 do not fit into a 32-bit INT column,
    # so the primary key is a BIGINT.
    id = BigIntegerField(primary_key=True, verbose_name="商品id")
    # Product titles are routinely far longer than 10 characters.
    name = CharField(max_length=500, verbose_name="商品名称")
    content = TextField(default="", verbose_name="商品描述")
    supplier = CharField(max_length=500, verbose_name="供应商")
    ggbz = TextField(default="", verbose_name="规格包装")
    image_list = TextField(default="", verbose_name="介绍图")
    price = FloatField(default=0.0, verbose_name="价格")
    good_rate = IntegerField(default=0, verbose_name="好评率")
    comment_nums = IntegerField(default=0, verbose_name="评论数")
    image_comment_nums = IntegerField(default=0, verbose_name="晒图数")
    video_comment_nums = IntegerField(default=0, verbose_name="视频数")
    add_comment_nums = IntegerField(default=0, verbose_name="追评数")
    good_comment_nums = IntegerField(default=0, verbose_name="好评数")
    mid_comment_nums = IntegerField(default=0, verbose_name="中评数")
    bad_comment_nums = IntegerField(default=0, verbose_name="差评数")
class GoodEvaluateSummary(BaseModel):
    """One evaluation tag of a product together with its count."""

    # Product the tag belongs to.
    good = ForeignKeyField(Good, verbose_name="商品")
    # Tag text, at most 10 characters.
    tag = CharField(verbose_name="标签", max_length=10)
    # Count stored for the tag; 0 until set.
    num = IntegerField(verbose_name="数量", default=0)
class GoodEvaluate(BaseModel):
    """A single user evaluation (review) of a product."""

    # String primary key supplied by the caller (not auto-generated).
    id = CharField(primary_key=True)
    good = ForeignKeyField(Good, verbose_name="商品")
    user_head_url = CharField(verbose_name="用户头像")
    user_name = CharField(verbose_name="用户名")
    # Holds a JSON-encoded list describing the purchased variant; JSON escapes
    # non-ASCII text, so 10 characters is far too small for it.
    good_info = CharField(max_length=500, verbose_name="当前评论的商品信息")
    evaluate_time = DateTimeField(verbose_name="评价时间")
    content = TextField(default="", verbose_name="评价内容")
    star = IntegerField(default=0, verbose_name="评分")
    comment_nums = IntegerField(default=0, verbose_name="评论数")
    like_nums = IntegerField(default=0, verbose_name="点赞数")
    # JSON-encoded lists of media URLs.
    image_list = TextField(default="", verbose_name="图片")
    video_list = TextField(default="", verbose_name="视频")
if __name__ == '__main__':
    # Running this module directly creates the three tables in the database.
    tables = [Good, GoodEvaluateSummary, GoodEvaluate]
    db.create_tables(tables)
然后执行主代码
[Python] 纯文本查看 复制代码
import time
from datetime import datetime
from scrapy import Selector
from models import *
import json
import re
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
# Module-wide Chrome session used by parse_good(); the driver path is whatever
# ChromeDriverManager().install() returns. Created as soon as the module loads.
browse = webdriver.Chrome(ChromeDriverManager().install())
# Convert the count embedded in a label such as "全部评价(20万+)" to an integer.
def strToNum(str):
    """Return the first number found in *str* as an int.

    When the text contains "万" (ten thousand) the number is scaled by 10000,
    including a decimal part if present, so "1.5万+" yields 15000. Without
    "万" only the integer part of the first number is returned. Text without
    any digits yields 0.

    The parameter keeps its original name ``str`` for backward compatibility,
    although it shadows the builtin inside this function.
    """
    matched = re.search(r"(\d+)(?:\.(\d+))?", str)
    if not matched:
        return 0
    whole = int(matched.group(1))
    if "万" not in str:
        return whole
    # The first four fractional digits of an "N.NNNN万" value are whole units;
    # pad/truncate to exactly four so the result stays exact (no floats).
    fraction = matched.group(2) or ""
    return whole * 10000 + int((fraction + "0000")[:4])
# Labels of the comment filters on the product page, mapped to the Good
# attribute that stores the corresponding count.
_COMMENT_COUNT_FIELDS = {
    "全部评价": "comment_nums",
    "晒图": "image_comment_nums",
    "视频晒单": "video_comment_nums",
    "追评": "add_comment_nums",
    "好评": "good_comment_nums",
    "中评": "mid_comment_nums",
    "差评": "bad_comment_nums",
}

# Captures the sub-domain of the first "<a href="//xxx.jd.com" link in the
# service summary (non-greedy, with the dots escaped).
_SUPPLIER_RE = re.compile(r'<a href="//(.*?)\.jd\.com')

# Splits a tag such as "清晰度高(227)" into its text and its count.
_TAG_RE = re.compile(r"(.*)\((\d+)\)")


def _page_selector():
    """Return a Selector over the browser's current page source."""
    return Selector(text=browse.page_source)


def _save_with_manual_pk(model_cls, instance):
    """Insert *instance*, or update the stored row that has the same id."""
    already_stored = model_cls.select().where(model_cls.id == instance.id).exists()
    # The primary keys are assigned by hand, so a new row needs an explicit
    # force_insert; an existing row is updated instead of raising on a
    # duplicate key.
    instance.save(force_insert=not already_stored)


def _fill_comment_counts(good, sel):
    """Copy the per-filter comment counts from the page onto *good*."""
    for link in sel.xpath('//ul[@class="filter-list"]/li/a'):
        label = link.xpath('./text()').extract()[0]
        count = strToNum(link.xpath('./em/text()').extract()[0])
        field_name = _COMMENT_COUNT_FIELDS.get(label)
        if field_name:
            setattr(good, field_name, count)


def _save_tag_summaries(good, tag_list):
    """Create or update one GoodEvaluateSummary per "text(count)" tag."""
    for tag in tag_list:
        # Example tags: 清晰度高(227) 流畅至极(208) 充电快速(185) 颜色绚丽(118)
        matched = _TAG_RE.match(tag)
        if not matched:
            continue
        tag_name = matched.group(1)
        summary = GoodEvaluateSummary.select().where(
            GoodEvaluateSummary.good == good,
            GoodEvaluateSummary.tag == tag_name,
        ).first()
        if summary is None:
            summary = GoodEvaluateSummary(good=good)
        summary.tag = tag_name
        summary.num = int(matched.group(2))
        summary.save()


def _parse_evaluate(item, good):
    """Build an unsaved GoodEvaluate from one 'comment-item' node."""
    evaluate = GoodEvaluate(good=good)
    evaluate.id = item.xpath('./@data-guid').extract()[0]
    evaluate.user_head_url = item.xpath(".//div[@class='user-info']//img/@src").extract()[0]
    evaluate.user_name = "".join(item.xpath(".//div[@class='user-info']/text()").extract()).strip()

    # The rating is the last character of the star element's class attribute.
    star_class = item.xpath('./div[2]/div[1]/@class').extract()[0]
    evaluate.star = int(star_class[-1])

    # Only the first text node of the comment paragraph is stored; an
    # evaluation without text is stored as an empty string.
    content_nodes = item.xpath('./div[2]/p[1]/text()').extract()
    evaluate.content = content_nodes[0].strip() if content_nodes else ""

    image_list = item.xpath("./div[2]//div[@class='pic-list J-pic-list']/a/img/@src").extract()
    video_list = item.xpath("./div[2]//div[@class='J-video-view-wrap clearfix']//video/@src").extract()
    evaluate.image_list = json.dumps(image_list)
    evaluate.video_list = json.dumps(video_list)

    evaluate.like_nums = int(item.xpath(".//div[@class='comment-op']/a[2]/text()").extract()[0])
    evaluate.comment_nums = int(item.xpath(".//div[@class='comment-op']/a[3]/text()").extract()[0])

    # The order-info spans hold the purchased variant followed by the
    # evaluation time as the last entry.
    order_spans = item.xpath(".//div[@class='order-info']/span/text()").extract()
    evaluate.good_info = json.dumps(order_spans[:-1])
    evaluate.evaluate_time = datetime.strptime(order_spans[-1], "%Y-%m-%d %H:%M")
    return evaluate


def parse_good(good_id):
    """Scrape one JD product page and store it with its evaluations.

    Opens https://item.jd.com/<good_id>.html in the module-level ``browse``
    session, saves the product as a ``Good`` row, its evaluation tags as
    ``GoodEvaluateSummary`` rows and every evaluation on every comment page
    as a ``GoodEvaluate`` row. Rows that already exist are updated.

    Args:
        good_id: numeric product id used in the page URL and as Good.id.

    Raises:
        ValueError: if the price text on the page is not a number.
        IndexError: if an element the parser expects is missing from the page.
        NoSuchElementException: if the specification tab cannot be found.
    """
    # Open the page and give it time to render (sleep arguments are seconds).
    browse.get("https://item.jd.com/{}.html".format(good_id))
    time.sleep(2)
    sel = _page_selector()

    # Basic product information, extracted with XPath.
    good = Good(id=good_id)
    good.name = "".join(sel.xpath('//div[@class="sku-name"]/text()').extract()).strip()
    good.price = float("".join(sel.xpath('//span[@class="price J-p-{}"]/text()'.format(good_id)).extract()).strip())
    good.content = "".join(sel.xpath("//div[@id='detail']//div[@class='tab-con']").extract())
    good.image_list = json.dumps(sel.xpath("//div[@id='spec-list']//img/@src").extract())

    # Supplier: the shop sub-domain when a shop link is present, else "jd".
    supplier_info = "".join(sel.xpath("//div[@id='summary-service']").extract())
    supplier_match = _SUPPLIER_RE.search(supplier_info)
    good.supplier = supplier_match.group(1) if supplier_match else "jd"

    # Open the "specification and packaging" tab.
    time.sleep(3)
    browse.find_element(
        "xpath", "//div[@class='tab-main large']//li[contains(text(),'规格与包装')]"
    ).click()
    time.sleep(2)
    # The page source must be re-read after every click.
    sel = _page_selector()
    good.ggbz = "".join(sel.xpath('//div[@id="detail"]/div[@class="tab-con"]').extract())

    # Open the evaluations tab; if the tab is not found, carry on with the
    # page as it is (best effort).
    time.sleep(2)
    try:
        browse.find_element(
            "xpath", "//li[@clstag='shangpin|keycount|product|shangpinpingjia_1']"
        ).click()
    except NoSuchElementException:
        pass
    # Wait for the evaluations to load *before* taking the snapshot.
    time.sleep(10)
    sel = _page_selector()

    tag_list = sel.xpath("//div[@class='tag-list tag-available']//span/text()").extract()
    good.good_rate = int(sel.xpath("//div[@class='percent-con']/text()").extract()[0])
    _fill_comment_counts(good, sel)

    _save_with_manual_pk(Good, good)
    _save_tag_summaries(good, tag_list)

    # Walk through all evaluation pages.
    while True:
        for item in sel.xpath("//div[@class='comment-item']"):
            _save_with_manual_pk(GoodEvaluate, _parse_evaluate(item, good))
        try:
            next_link = browse.find_element(
                "xpath", '//div[@id="comment"]//a[@class="ui-pager-next"]'
            )
        except NoSuchElementException:
            # No "next page" link: this was the last page.
            break
        next_link.send_keys("\n")
        time.sleep(3)
        # Re-read the page so the next iteration parses the new evaluations.
        sel = _page_selector()
if __name__ == '__main__':
    try:
        parse_good(100009956275)
    finally:
        # Always shut the browser down, even when scraping fails, so no
        # Chrome/driver process is left running.
        browse.quit()
|