"""
:author: zgq
:describe: 预计输出两个文件
1,一个商品信息文件,存储商品的详细数据
2,二个评论存储文件,存储每个商品的评论条数
两个文件使用商品id进行关联
"""
import csv
import json
import re
import time

import requests as req
from fake_useragent import UserAgent
# Silence urllib3's warnings about unverified HTTPS certificates.
req.packages.urllib3.disable_warnings()
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Path of the log file.
LOGADDRESS = "log.txt"
# Path of the file that stores product information.
PHONEINFOADDRESS = "phoneinfo.csv"
# Path of the file that stores product comments.
COMMENTSADDRESS = "comments.csv"
# Search keywords to crawl; every keyword uses the same page window.
PHONEKEYWORDS = [
    {"keyword": keyword, "initpage": 0, "endpage": 100}
    for keyword in ("手机", "ipad", "电脑", "笔记本")
]
# Log file handle shared by every log() call; opened in append mode.
logfp = open(LOGADDRESS, mode="a", encoding="utf-8")


def log(text):
    """Write a timestamped message to the log file and echo it to stdout.

    :param text: the message to record
    :return: None
    """
    # time.strftime() formats the current local time when no time tuple is given.
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    entry = "[%s]:%s\n" % (timestamp, text)
    logfp.write(entry)
    print(entry)
    logfp.flush()
class SpiderTaobao(object):
    """Crawl Taobao search results for one keyword and persist what is found.

    Every product found is appended as a CSV row to ``PHONEINFOADDRESS`` and
    every one of its comments as a CSV row to ``COMMENTSADDRESS``. The product
    id is the first column of both files, which is how the two are joined.
    """

    # Products per search-result page. The ``s`` query parameter is an item
    # offset, so page k starts at offset k * PAGE_SIZE.
    PAGE_SIZE = 44
    # Seconds to pause after each comment-page request.
    COMMENT_DELAY = 3
    # Seconds to wait for a response; without a timeout a stalled connection
    # would block the crawl forever.
    REQUEST_TIMEOUT = 30
    # Placeholder: replace with the Cookie header copied from a browser session.
    COOKIE = "自己去网页中将cookie复制过来即可"

    # Patterns that cut the JSON payload out of the HTML / JSONP responses.
    _PAGE_CONFIG_RE = re.compile("g_page_config = (.*?)};")
    _DSR_JSONP_RE = re.compile(r'jsonp\d+\((.*?)\)')
    _RATE_JSONP_RE = re.compile(r'jsonp\d+\((.*?)}\)')

    def __init__(self, keywords, initpage, endpage):
        """Prepare a crawl of search pages ``initpage`` .. ``endpage - 1``.

        :param keywords: the search keyword
        :param initpage: index of the first search page to crawl
        :param endpage: index one past the last search page to crawl
        """
        # Search URL template: q = keyword, s = item offset.
        self.url = "https://s.taobao.com/search?q={}&s={}"
        self.keywords = keywords
        self.endPage = endpage
        self.initPage = initpage
        # Both output files are opened in append mode so reruns keep old rows.
        self.phoneInfoFp = open(PHONEINFOADDRESS, "a", encoding="utf-8")
        self.phoneCommentsFp = open(COMMENTSADDRESS, "a", encoding="utf-8")
        self.ua = UserAgent()

    @staticmethod
    def _write_row(fp, row):
        """Append ``row`` to ``fp`` as one CSV record, then flush.

        ``csv.writer`` quotes fields containing commas, quotes or newlines and
        accepts non-string values, so a product title or comment text can no
        longer break the column layout. An empty row writes nothing.
        """
        if row:
            # "\n" keeps rows without special characters byte-identical to
            # what the previous hand-joined output produced.
            csv.writer(fp, lineterminator="\n").writerow(row)
        fp.flush()

    def _headers(self, referer=None):
        """Build the request headers, optionally including a Referer."""
        headers = {
            "User-Agent": self.ua.chrome,
            "Cookie": self.COOKIE,
        }
        if referer is not None:
            headers["Referer"] = referer
        return headers

    def writePhoneInfo(self, lines):
        """Append one product row to the product-information file.

        Column order: id, title, price, address, salesNum, storeId,
        storeName, commentUrl, gradeAvg.

        :param lines: the field values of one row
        :return: None
        """
        self._write_row(self.phoneInfoFp, lines)

    def writePhoneComment(self, lines):
        """Append one comment row to the comments file.

        :param lines: the field values of one row
        :return: None
        """
        self._write_row(self.phoneCommentsFp, lines)

    def parse_getPhone(self, html):
        """Extract the products from a search page and store each of them.

        Products whose ``comment_url`` contains ``"http"`` are skipped. For
        every remaining product the average grade is fetched and a row is
        written with :meth:`writePhoneInfo`.

        :param html: HTML of one search-result page
        :return: list of dicts with keys ``id``, ``storeUrl`` (the seller's
            ``user_id``) and ``commentUrl``, one per stored product
        :raises IndexError: if ``g_page_config`` is not present in ``html``
        """
        log("正在解析商品数据...")
        # The non-greedy match stops just before the closing brace, so it is
        # appended again before parsing.
        info = json.loads(self._PAGE_CONFIG_RE.findall(html)[0] + "}")
        products = []
        for item in info["mods"]["itemlist"]["data"]["auctions"]:
            if "http" in item["comment_url"]:
                continue
            grade_avg = self.get_gradeAvg(item["nid"], item["user_id"], item["comment_url"])
            self.writePhoneInfo([
                item["nid"],
                item["title"],
                item["view_price"],
                item["item_loc"],
                item["view_sales"],
                item["user_id"],
                item["nick"],
                item["comment_url"],
                str(grade_avg),
            ])
            products.append({
                "id": item["nid"],
                "storeUrl": item["user_id"],
                "commentUrl": item["comment_url"],
            })
        return products

    def get_PhonePage(self, page):
        """Download one search-result page.

        :param page: item offset of the page (page index * ``PAGE_SIZE``)
        :return: the HTML text of the page
        """
        return req.get(
            self.url.format(self.keywords, page),
            headers=self._headers(),
            timeout=self.REQUEST_TIMEOUT,
        ).text

    def get_gradeAvg(self, itemid, storeid, referer):
        """Fetch the average grade of a product.

        Queries ``https://dsr-rate.tmall.com/list_dsr_info.htm`` with the
        product and seller ids.

        :param itemid: product id
        :param storeid: seller id
        :param referer: scheme-less comment URL; sent as ``"https:" + referer``
        :return: the ``dsr.gradeAvg`` value of the JSONP response
        :raises IndexError: if the response is not the expected JSONP
        """
        response = req.get(
            "https://dsr-rate.tmall.com/list_dsr_info.htm?",
            params={"itemId": itemid, "sellerId": storeid},
            headers=self._headers("https:" + referer),
            verify=False,
            timeout=self.REQUEST_TIMEOUT,
        )
        payload = json.loads(self._DSR_JSONP_RE.findall(response.text)[0])
        return payload["dsr"]["gradeAvg"]

    def getComments(self, itemid, storeid, referer, page):
        """Fetch one page of comments for a product.

        :param itemid: product id
        :param storeid: seller id
        :param referer: comment page URL. NOTE(review): currently unused; a
            fixed Referer is sent instead. Confirm whether that is intended
            before switching to this argument.
        :param page: 1-based comment page number
        :return: the decoded JSON object of the JSONP response
        :raises IndexError: if the response is not the expected JSONP
        """
        params = {
            "itemId": str(itemid),
            "sellerId": str(storeid),
            "order": "3",
            "currentPage": str(page),
            "append": "0",
            "content": "1",
            "tagId": "",
            "posi": "",
            "picture": "",
            "groupId": "",
            # NOTE(review): this value is already percent-encoded and will be
            # encoded again when sent via ``params`` - verify the server
            # accepts it in that form.
            "ua": "098%23E1hvMQvRvBwvUpCkvvvvvjiPnL5ygjYWPFFwtjD2PmPp6jtnRsSZljEmP25yAjYWRLyCvvpvvhCv9phvHnsGvHq%2FzYswzWU37%2FwZzb2w4xiIdphvhIpmj1rzvvmcKfhSpqoxP2It%2B8wCvvpvvhHh2QhvCvvvvvvCvpvVvUCvpvvvKphv8vvvpHgvvvvvvvChDQvv9ayvvhNjvvvmjvvvBGwvvvUnvvCj1Qvvv90ivpvUvvCCWeEpHHAEvpvVvpCmp%2F2pmphvLvHjvphaT2eARdIAcUmxdBkK5kx%2FsjZ7%2Bu0XjomxfBkKHdUf85c6%2Bu0Ode%2BRfwAKHd8rwAq6k28AR293ZY0tKuGt%2BFXNAXhGWT%2FSOygtvpvhvvCvpUwCvvpv9hCvdphvmpmCC9CevvmX346CvCoH9RPeJvvvV%2B6DjlzE3LwAI2I6B9%3D%3D",
            "needFold": "0",
        }
        response = req.get(
            "https://rate.tmall.com/list_detail_rate.htm?",
            params=params,
            headers=self._headers(
                "https://detail.tmall.com/item.htm?spm=a230r.1.14.14.15586fbeh2UH2Z&id=616969094580&cm_id=140105335569ed55e27b&abbucket=8&sku_properties=5919063:6536025;12304035:1905146457;122216431:27772 "
            ),
            verify=False,
            timeout=self.REQUEST_TIMEOUT,
        )
        # The pattern consumes the final "}" of the payload, so restore it.
        return json.loads(self._RATE_JSONP_RE.findall(response.text)[0] + "}")

    def _crawl_comments(self, product):
        """Fetch and store every comment page of one product.

        :param product: dict with keys ``id``, ``storeUrl`` and ``commentUrl``
        """
        page = 1
        while True:
            cmt = self.getComments(product["id"],
                                   product["storeUrl"],
                                   product["commentUrl"],
                                   page)
            rate_detail = cmt["rateDetail"]
            last_page = int(rate_detail["paginator"]["lastPage"])
            log("商品【%s】的评论总页数为%d" % (product["id"], last_page))
            log("爬取并存储商品id为【%s】-第%d页评论" % (product["id"], page))
            for comment in rate_detail["rateList"]:
                self.writePhoneComment([
                    product["id"],
                    comment["auctionSku"],
                    comment["cmsSource"],
                    comment["rateDate"],
                    comment["rateContent"],
                    comment["displayUserNick"],
                ])
            # Pause after every request to keep the request rate low.
            time.sleep(self.COMMENT_DELAY)
            # Stop once the last page has been stored; requesting page
            # last_page + 1 would be a wasted request.
            if page >= last_page:
                break
            page += 1

    def main(self):
        """Crawl every configured search page and the comments of its products.

        An exception anywhere while handling a search page is logged and the
        crawl moves on to the next page.

        :return: None
        """
        for pages in range(self.initPage, self.endPage):
            try:
                log("正在爬取第%d页" % pages)
                products = self.parse_getPhone(self.get_PhonePage(pages * self.PAGE_SIZE))
                for product in products:
                    log("正在爬取第%d页id 为%s的商品" % (pages, product["id"]))
                    self._crawl_comments(product)
            except Exception as e:
                # Best effort by design: give up on this page, keep crawling.
                log("出现异常,执行回调,跳到下一页")
                log(e)
if __name__ == '__main__':
    # Script entry point: crawl each configured keyword in turn.
    log("程序开始...")
    for task in PHONEKEYWORDS:
        keyword = task["keyword"]
        log("爬取关键字为【%s】的页面" % keyword)
        SpiderTaobao(keyword, task["initpage"], task["endpage"]).main()
    log("程序正常结束...")