So far I have worked through Python crawling, and I have a good handle on Scrapy and scrapy-redis; ordinary pages give me no real trouble, but reverse-engineering JS is still beyond me. I think the two directions I should work on next are JS reversing and code optimization.
1. Looking for JS reverse-engineering tutorials.
2. Please point out where the spider below could be improved.
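One spot I already suspect is wrong: I call time.sleep(1) inside the callbacks, which blocks Scrapy's whole reactor instead of just spacing out one request. A minimal sketch of what I think the replacement looks like (DOWNLOAD_DELAY and CONCURRENT_REQUESTS_PER_DOMAIN are standard Scrapy settings; the spider name and the numbers are only placeholders, not my real spider):

[Python]
import scrapy

class ThrottledExampleSpider(scrapy.Spider):
    # Hypothetical spider, only to show throttling via settings instead of sleep().
    name = "throttled_example"
    start_urls = ["https://book.suning.com"]
    custom_settings = {
        "DOWNLOAD_DELAY": 1,                  # ~1 s between requests
        "CONCURRENT_REQUESTS_PER_DOMAIN": 4,  # cap parallelism per domain
    }

    def parse(self, response):
        self.logger.info("fetched %s", response.url)

With that in place, the sleep calls in my spider below should be removable. Here is the full spider: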
[Python]
# -*- coding: utf-8 -*-
import scrapy
from copy import deepcopy
from ..items import SuningbookItem
import re
import json
import time

# Parentheses must be escaped (unescaped they are capture groups, so the old
# pattern never matched the literal parens); greedy + re.S so a ")" nested
# inside the JSON payload cannot cut the match short.
json_ = re.compile(r"shopScoreCallback\((.*)\)", re.S)
BOOK_NAME = re.compile("《(.*?)》.*?")
Price = re.compile('"itemPrice":"(.*?)",')
Shop = re.compile('"shopName":"(.*?)",')
Publish = re.compile('"brandName":"(.*?)",')
# \s* instead of a literal newline, so the patterns survive whitespace
# changes in the page source.
Writer = re.compile(r"<li>作者:\s*<span>(.*?)</span>")
Date = re.compile("<li>出版时间:(.*?)</li>")
Phone = re.compile(r"<dt>电话:</dt>\s*<dd>\s*<p>(.*?)</p>")
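
# Sketch (my idea, not used below yet): the shop-score response is JSONP, so
# instead of hand-written regexes I could strip the shopScoreCallback(...)
# wrapper with the json_ pattern above and hand the rest to json.loads.
# This assumes the payload inside the parentheses is plain JSON, which I
# have not verified yet.
def parse_jsonp(body):
    match = json_.search(body)
    return json.loads(match.group(1)) if match else None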
class SnbookSpider(scrapy.Spider):
    name = 'snbook'
    allowed_domains = ['suning.com']
    start_urls = ['https://book.suning.com']

    def parse(self, response):
        item = SuningbookItem()
        menu_list = response.xpath("//div[@class='menu-list']/div[@class='menu-item']")
        sub_list = response.xpath("//div[@class='menu-list']/div[@class='menu-sub']")
        for index, menu in enumerate(menu_list):
            item["b_label"] = menu.xpath(".//dt//a/text()").extract_first()
            sub = sub_list[index].xpath("./div[@class='submenu-left']/p")
            for m in sub:
                item["m_label"] = m.xpath("./a//text()").extract_first()
                s_list = m.xpath("./following-sibling::ul[1]/li")
                for s in s_list:
                    # raw strings, otherwise "\d" is an invalid escape in Python 3
                    item["s_label"] = s.xpath(r'./a[re:test(@href,"https://list.suning.com/\d-\d+-\d.html")]/text()').extract_first()
                    li_url = s.xpath(r'./a[re:test(@href,"https://list.suning.com/\d-\d+-\d.html")]/@href').extract_first()
                    if li_url is not None:
                        yield scrapy.Request(url=li_url, callback=self.parse_list, meta={"item": deepcopy(item)})
                        # about 100 pages per category
                        ci = li_url.split("-")[1]
                        for i in range(1, 100):
                            next_url = f"https://list.suning.com/1-{ci}-{i}.html"
                            next_part = f"https://list.suning.com/emall/showProductList.do?ci={ci}&pg=03&cp={i}&il=0&iy=0&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAABC&id=IDENTIFYING&cc=510&paging=1&sub=0"
                            yield scrapy.Request(url=next_url, callback=self.parse_list, meta={"item": deepcopy(item)})
                            time.sleep(1)  # blocks the whole reactor -- the settings sketch above is my planned fix
                            yield scrapy.Request(url=next_part, callback=self.parse_list, meta={"item": deepcopy(item)})
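
    # Sketch (not wired in anywhere): rather than hard-coding ~100 pages per
    # category in parse(), a guard like this could let pagination stop as
    # soon as a listing page comes back empty.
    def has_products(self, response):
        # True if the listing page contains at least one product block.
        return bool(response.xpath("//div[@class='res-info']"))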
    def parse_list(self, response):
        item = response.meta["item"]
        # The old expression  xpath("//ul[@class='clearfix']/li" or "//li")
        # never evaluated the second XPath ("x" or "y" is just "x" in
        # Python), so the fallback is now an explicit check.
        li_list = response.xpath("//ul[@class='clearfix']/li")
        if not li_list:
            li_list = response.xpath("//li")
        for li in li_list:
            detail_url = li.xpath(".//div[@class='res-info']/p[@class='sell-point']/a/@href").extract_first()
            if detail_url is not None:
                detail_url = "http:" + detail_url
                time.sleep(1)  # blocks the reactor as well -- see the note above
                yield scrapy.Request(url=detail_url, callback=self.parse_book, meta={"item": deepcopy(item)})
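
    # Sketch: response.follow() resolves relative links against response.url,
    # including protocol-relative "//product.suning.com/..." hrefs, so the
    # manual "http:" + detail_url concatenation above could become:
    #
    #     yield response.follow(detail_url, callback=self.parse_book,
    #                           meta={"item": deepcopy(item)})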
    def parse_book(self, response):
        item = response.meta["item"]
        item["book_name"] = "".join(response.css("title::text").re(BOOK_NAME))
        main_info = response.xpath("//script[@type='text/javascript']/text()")
        # re_first() returns the first match or None, replacing the repeated
        # "x.re(P)[0] if len(x.re(P)) > 0 else None" pattern, which also ran
        # every regex twice.
        item["book_price"] = main_info.re_first(Price)
        item["book_shop"] = main_info.re_first(Shop)
        item["book_publish"] = main_info.re_first(Publish)
        item["book_date"] = response.xpath("//div").re_first(Date)
        writer = response.xpath("//div").re_first(Writer)
        # "著作|者著" must come before the bare "著"; alternation is ordered,
        # so the old "(著)|(著作)|(者著)" matched "著" first and left a stray
        # "作" behind on "著作".
        item["book_writer"] = re.sub("著作|者著|著", "", writer) if writer is not None else None
        item["book_phone"] = response.xpath("//div").re_first(Phone)
        # Build the shop-score URL from the product id embedded in the page
        # URL. (Splitting on "00" is fragile; it breaks if the id itself
        # contains "00".)
        num = response.url.split("00")[1].split("/")[0]
        if num:
            shop_url = f"http://product.suning.com/pds-web/ajax/getApiRemoteMap_{num}_shopScoreCallback.html?callback=shopScoreCallback"
            yield scrapy.Request(url=shop_url, callback=self.parse_shop, meta={"item": deepcopy(item)})
    def parse_shop(self, response):
        item = response.meta["item"]
        body = response.text
        self.logger.debug(body)  # dump the raw JSONP while debugging
        # The JSONP payload arrives with escaped quotes (\"...\"); the "."
        # wildcards around the quotes are what absorb those backslashes.
        content = re.findall('parentIndexScore.":."(.*?).",', body)
        if len(content) == 3:
            item["book_remark"] = content[0]
            item["Logistics_score"] = content[1]
            item["after_sell"] = content[2]
            yield item
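
Another spot I would like opinions on: I thread a single SuningbookItem through every request and deepcopy it at each hop. My understanding is that the cleaner pattern is a fresh dict per branch passed via cb_kwargs (available since Scrapy 1.7). A minimal sketch; the spider name and XPaths are just for illustration, only b_label reuses a field from my item:

[Python]
import scrapy

class LabelSketchSpider(scrapy.Spider):
    # Hypothetical mini-spider: pass accumulated fields forward with
    # cb_kwargs instead of mutating one shared item and deepcopying it.
    name = "label_sketch"
    start_urls = ["https://book.suning.com"]

    def parse(self, response):
        for menu in response.xpath("//div[@class='menu-list']/div[@class='menu-item']"):
            labels = {"b_label": menu.xpath(".//dt//a/text()").get()}
            for href in menu.xpath(".//a/@href").getall():
                # dict(labels) makes a fresh shallow copy per request
                yield response.follow(href, callback=self.parse_list,
                                      cb_kwargs={"labels": dict(labels)})

    def parse_list(self, response, labels):
        labels["list_url"] = response.url
        yield labels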