While studying I wrote a scraper that collects product and review data from Taobao.
This post is for learning purposes only!!!
```
"""
:author: zgq
:describe: 预计输出两个文件
1,一个商品信息文件,存储商品的详细数据
2,二个评论存储文件,存储每个商品的评论条数
两个文件使用商品id进行关联
"""
import requests as req
from fake_useragent import UserAgent
import json
import re
import time
# Suppress HTTPS certificate warnings (the requests below use verify=False)
req.packages.urllib3.disable_warnings()
"""
配置文件
"""
# 日志文件存储路径
LOGADDRESS = "log.txt"
# 商品信息文件存储路径
PHONEINFOADDRESS = "phoneinfo.csv"
# 商品评论信息文件存储路径
COMMENTSADDRESS = "comments.csv"
# 获取的商品关键字
PHONEKEYWORDS = [
    {
        "keyword": "手机",  # mobile phone
        "initpage": 0,
        "endpage": 100
    }, {
        "keyword": "ipad",
        "initpage": 0,
        "endpage": 100
    }, {
        "keyword": "电脑",  # computer
        "initpage": 0,
        "endpage": 100
    }, {
        "keyword": "笔记本",  # laptop
        "initpage": 0,
        "endpage": 100
    }
]
logfp = open(LOGADDRESS, "a", encoding="utf-8")
def log(text):
    """
    Logging helper.
    :param text: message to log
    :return:
    """
    info = "[%s]:%s\n" % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), text)
    logfp.write(info)
    print(info)
    logfp.flush()
class SpiderTaobao(object):
    """
    Main Taobao spider object.
    """
    def __init__(self, keywords, initpage, endpage):
        # Search URL template; s is the item offset (44 items per page)
        self.url = "https://s.taobao.com/search?q={}&s={}"
        # Search keyword
        self.keywords = keywords
        # Last page to scrape (how many pages in total)
        self.endPage = endpage
        # First page to scrape
        self.initPage = initpage
        self.phoneInfoFp = open(PHONEINFOADDRESS, "a", encoding="utf-8")
        self.phoneCommentsFp = open(COMMENTSADDRESS, "a", encoding="utf-8")
        self.ua = UserAgent()
    def writePhoneInfo(self, lines):
        """
        Write one product-info record to the CSV file.
        # id,title,price,address,salesNum,storeName,commentUrl
        :param lines: list of field values for one row
        :return: None
        """
        # Join the fields with commas and end the row with a newline
        # (the original loop wrote the whole list instead of each element)
        self.phoneInfoFp.write(",".join(str(field) for field in lines) + "\n")
        self.phoneInfoFp.flush()
    def writePhoneComment(self, lines):
        """
        Write one comment record to the CSV file.
        :param lines: list of field values for one row
        :return: None
        """
        self.phoneCommentsFp.write(",".join(str(field) for field in lines) + "\n")
        self.phoneCommentsFp.flush()
    def parse_getPhone(self, html):
        """
        Parse the product details out of a search-result page.
        :param html: HTML of the search-result page
        :return: for each of the 44 products on the page, its id,
                 store id and comment-page URL
        """
        log("Parsing product data...")
        # The data is embedded in the page as a JS assignment: g_page_config = {...};
        # the lazy match stops before the closing "}", so it is re-appended
        info = json.loads(re.compile("g_page_config = (.*?)};").findall(html)[0] + "}")
        # id,title,price,address,salesNum,storeId,storeName,commentUrl
        # 'uid', 'title', 'view_price', 'item_loc','view_sales', 'user_id', 'nick', 'comment_url'
        phoneReturn = list()
        for phoneInfo in info["mods"]["itemlist"]["data"]["auctions"]:
            # Skip items whose comment_url is a full http URL
            if "http" in phoneInfo["comment_url"]:
                continue
            poList = list()
            pr = dict()
            poList.append(phoneInfo["nid"])
            poList.append(phoneInfo["title"])
            poList.append(phoneInfo["view_price"])
            poList.append(phoneInfo["item_loc"])
            poList.append(phoneInfo["view_sales"])
            poList.append(phoneInfo["user_id"])
            poList.append(phoneInfo["nick"])
            poList.append(phoneInfo["comment_url"])
            gradeAvg = self.get_gradeAvg(phoneInfo["nid"], phoneInfo["user_id"], phoneInfo["comment_url"])
            poList.append(str(gradeAvg))
            # Write this product's record to the product-info file
            self.writePhoneInfo(poList)
            pr["id"] = phoneInfo["nid"]
            pr["storeUrl"] = phoneInfo["user_id"]
            pr["commentUrl"] = phoneInfo["comment_url"]
            phoneReturn.append(pr)
        return phoneReturn
    def get_PhonePage(self, page):
        """
        Fetch a search-result page.
        :param page: item offset to request; page/44 is the page number
        :return: HTML of the requested page
        """
        return req.get(self.url.format(self.keywords, page), headers={
            "User-Agent": self.ua.chrome,
            "Cookie": "paste your own cookie copied from the browser here"
        }).text
    def get_gradeAvg(self, itemid, storeid, referer):
        """
        Fetch a product's average rating.
        https://dsr-rate.tmall.com/list_dsr_info.htm
        ?itemId=616969094580   product id
        &sellerId=1776456424   store id
        &groupId
        :param itemid: product id
        :param storeid: store id
        :param referer: comment-page URL
        :return: the average rating
        """
        url = "https://dsr-rate.tmall.com/list_dsr_info.htm?"
        params = {
            "itemId": itemid,
            "sellerId": storeid
        }
        response = req.get(url, params=params, headers={
            "User-Agent": self.ua.chrome,
            "Cookie": "paste your own cookie copied from the browser here",
            "Referer": "https:" + referer
        }, verify=False)
        # The response is a JSONP payload like jsonp123({...}); strip the wrapper before parsing
        grade = json.loads(re.compile(r'jsonp\d+\((.*?)\)').findall(response.text)[0])["dsr"]["gradeAvg"]
        return grade
    def getComments(self, itemid, storeid, referer, page):
        """
        Fetch one page of comments for a product, by product id and store id.
        :param itemid: product id
        :param storeid: store id
        :param page: comment page number
        :param referer: comment-page URL
        :return: the parsed comment data
        """
        url = "https://rate.tmall.com/list_detail_rate.htm?"
        params = {
            "itemId": str(itemid),
            "sellerId": str(storeid),
            "order": "3",
            "currentPage": str(page),
            "append": "0",
            "content": "1",
            "tagId": "",
            "posi": "",
            "picture": "",
            "groupId": "",
            # "ua" appears to be an anti-bot token copied from a captured browser request
            "ua": "098%23E1hvMQvRvBwvUpCkvvvvvjiPnL5ygjYWPFFwtjD2PmPp6jtnRsSZljEmP25yAjYWRLyCvvpvvhCv9phvHnsGvHq%2FzYswzWU37%2FwZzb2w4xiIdphvhIpmj1rzvvmcKfhSpqoxP2It%2B8wCvvpvvhHh2QhvCvvvvvvCvpvVvUCvpvvvKphv8vvvpHgvvvvvvvChDQvv9ayvvhNjvvvmjvvvBGwvvvUnvvCj1Qvvv90ivpvUvvCCWeEpHHAEvpvVvpCmp%2F2pmphvLvHjvphaT2eARdIAcUmxdBkK5kx%2FsjZ7%2Bu0XjomxfBkKHdUf85c6%2Bu0Ode%2BRfwAKHd8rwAq6k28AR293ZY0tKuGt%2BFXNAXhGWT%2FSOygtvpvhvvCvpUwCvvpv9hCvdphvmpmCC9CevvmX346CvCoH9RPeJvvvV%2B6DjlzE3LwAI2I6B9%3D%3D",
            "needFold": "0",
        }
        response = req.get(url, params=params, headers={
            "User-Agent": self.ua.chrome,
            "Cookie": "paste your own cookie copied from the browser here",
            # Referer copied from a captured request for one fixed item
            "Referer": "https://detail.tmall.com/item.htm?spm=a230r.1.14.14.15586fbeh2UH2Z&id=616969094580&cm_id=140105335569ed55e27b&abbucket=8&sku_properties=5919063:6536025;12304035:1905146457;122216431:27772"
        }, verify=False)
        # The capture stops at "})" and drops the final "}", so it is re-appended
        return json.loads(re.compile(r'jsonp\d+\((.*?)}\)').findall(response.text)[0] + "}")
    def main(self):
        """
        Main driver.
        :return: None
        """
        # Fetch the product info on every search-result page
        for pages in range(self.initPage, self.endPage):
            try:
                log("Scraping page %d" % pages)
                # Every page holds 44 items: page 1 -> offset 0, page 2 -> 44, ... 88
                # parse_getPhone returns each product's id, storeUrl and commentUrl
                phoneCommentsInfo = self.parse_getPhone(self.get_PhonePage(pages * 44))
                for phoneComment in phoneCommentsInfo:
                    log("Scraping item %s on page %d" % (phoneComment["id"], pages))
                    # First page of the comment pagination
                    commentPage = 1
                    commentFlag = True
                    while commentFlag:
                        cmt = self.getComments(phoneComment["id"],
                                               phoneComment["storeUrl"],
                                               phoneComment["commentUrl"],
                                               commentPage)
                        allPages = int(cmt["rateDetail"]["paginator"]["lastPage"])
                        log("Item [%s] has %d comment pages in total" % (phoneComment["id"], allPages))
                        log("Scraping and storing comment page %d of item [%s]" % (commentPage, phoneComment["id"]))
                        # Store the comments on this page
                        for commentOne in cmt["rateDetail"]["rateList"]:
                            commentLines = list()
                            commentLines.append(phoneComment["id"])
                            commentLines.append(commentOne["auctionSku"])
                            commentLines.append(commentOne["cmsSource"])
                            commentLines.append(commentOne["rateDate"])
                            commentLines.append(commentOne["rateContent"])
                            commentLines.append(commentOne["displayUserNick"])
                            self.writePhoneComment(commentLines)
                        commentPage = commentPage + 1
                        # Stop once the page just stored was the last one
                        # (checking before the increment fetched one page too many)
                        commentFlag = commentPage <= allPages
                        time.sleep(3)
            except Exception as e:
                log("Exception raised; skipping to the next page")
                log(e)
                continue
if __name__ == '__main__':
    log("Program starting...")
    for key in PHONEKEYWORDS:
        log("Scraping pages for keyword [%s]" % key["keyword"])
        spider = SpiderTaobao(key["keyword"], key["initpage"], key["endpage"])
        spider.main()
    log("Program finished normally...")
```
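The module docstring notes that the two output files are linked by the product id. A minimal sketch of that join, assuming pandas is installed; the column names are my own labels, chosen to follow the order the script writes its fields, since the script writes no header row:

```
import pandas as pd

# Product details, one row per item, in the order writePhoneInfo receives them
info = pd.read_csv("phoneinfo.csv", header=None, names=[
    "id", "title", "price", "address", "salesNum",
    "storeId", "storeName", "commentUrl", "gradeAvg"])
# Comments, one row per comment, in the order writePhoneComment receives them
comments = pd.read_csv("comments.csv", header=None, names=[
    "id", "sku", "source", "rateDate", "rateContent", "userNick"])

# Each comment row picks up its product's details through the shared id column
merged = comments.merge(info, on="id", how="left")
print(merged.head())
```

One caveat: the script writes raw commas without quoting, so rows whose title or comment text contains a comma will not parse back cleanly.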
This post is for learning purposes only!!!

I haven't tried Taobao, but I've scraped Alibaba. I spent ages capturing packets, eventually got fed up and just saved the site's pages locally and scraped those (because I couldn't use Selenium), and that actually worked. When I went back to packet capture I could only get listings ranked 20th and below, so I kept saving pages by hand; there weren't many anyway. I then used a spreadsheet to split out the keyword rankings, extracting company name, rank, years of Chengxintong membership, and title. After using it like that for a while I changed jobs, so it was never finished.

神经病阿 posted on 2020-8-6 15:30:
How do I use this, man? Can I just drop it into PyCharm? The modules are giving me errors.

Sort out the errors yourself, or install the requests and fake_useragent libraries: both are third-party packages that you have to install yourself, with pip install requests and pip install fake_useragent.
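For copy-paste, the two install commands from that reply:

```
pip install requests
pip install fake_useragent
```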
Python is unbeatable for writing scrapers.

What does this code actually do, man? Does it link the comments of related products?

小图 posted on 2020-8-6 15:12:
I don't know how to use it; I prefer finished products.

This is the finished product: just substitute your own cookie, then set the product categories and page counts you want to scrape.

Thanks for sharing. After adding my own cookie (three places in total) it runs perfectly!

Why isn't the cookie written in one place and referenced from there? As it stands you have to edit three spots.
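That last comment is a fair point. A minimal sketch of the refactor, using a hypothetical COOKIE constant and build_headers helper that are not part of the original script:

```
# Hypothetical module-level constant, defined once next to the other settings
COOKIE = "paste your own cookie copied from the browser here"

def build_headers(user_agent, referer=None):
    # Single place that attaches the cookie, so it is edited once, not three times
    headers = {"User-Agent": user_agent, "Cookie": COOKIE}
    if referer is not None:
        headers["Referer"] = referer
    return headers

# Usage inside the spider, e.g. in get_gradeAvg:
#   response = req.get(url, params=params,
#                      headers=build_headers(self.ua.chrome, "https:" + referer),
#                      verify=False)
```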