snowballzzk posted on 2020-11-24 17:50

Taking my Python scraping to the next level

So far I've finished learning Python scraping: I have Scrapy and scrapy-redis figured out, and ordinary pages give me no real trouble, but reverse-engineering JS is still beyond me. I think my next steps are JS reversing and cleaning up my code.
1. Any tutorials on JS reverse engineering?
2. Could the experts here point out where my code can be improved?
# -*- coding: utf-8 -*-
import scrapy
from copy import deepcopy
from ..items import SuningbookItem
import re

# Patterns for pulling fields out of the page source and the inline JS.
# \s* keeps the multi-line patterns tolerant of whitespace in the HTML.
json_ = re.compile(r"shopScoreCallback\((.*)\)", re.S)  # strips the JSONP wrapper
BOOK_NAME = re.compile(r"《(.*?)》")
Price = re.compile(r'"itemPrice":"(.*?)",')
Shop = re.compile(r'"shopName":"(.*?)",')
Publish = re.compile(r'"brandName":"(.*?)",')
Writer = re.compile(r"<li>作者:\s*<span>(.*?)</span>")
Date = re.compile(r"<li>出版时间:(.*?)</li>")
Phone = re.compile(r"<dt>电话:</dt>\s*<dd>\s*<p>(.*?)</p>")

class SnbookSpider(scrapy.Spider):
    name = 'snbook'
    allowed_domains = ['suning.com']
    start_urls = ['https://book.suning.com']

    def parse(self, response):
        item = SuningbookItem()
        menu_list = response.xpath("//div[@class='menu-list']/div[@class='menu-item']")
        sub_list = response.xpath("//div[@class='menu-list']/div[@class='menu-sub']")
        for index, menu in enumerate(menu_list):
            item["b_label"] = menu.xpath(".//dt//a/text()").extract_first()
            # Each top-level menu pairs with the sub-menu block at the same index.
            sub = sub_list[index].xpath("./div[@class='submenu-left']/p")
            for m in sub:
                item["m_label"] = m.xpath("./a//text()").extract_first()
                s_list = m.xpath("./following-sibling::ul/li")
                for s in s_list:
                    item["s_label"] = s.xpath('./a/text()').extract_first()
                    li_url = s.xpath('./a/@href').extract_first()
                    if li_url is not None:
                        yield scrapy.Request(url=li_url, callback=self.parse_list,
                                             meta={"item": deepcopy(item)})
                        # 100 pages per category; the category id is the middle
                        # segment of a list URL shaped like
                        # https://list.suning.com/1-<ci>-0.html
                        ci = li_url.split("-")[1]
                        for i in range(1, 100):
                            next_url = f"https://list.suning.com/1-{ci}-{i}.html"
                            next_part = (
                                f"https://list.suning.com/emall/showProductList.do?"
                                f"ci={ci}&pg=03&cp={i}&il=0&iy=0&adNumber=0&n=1&ch=4"
                                f"&prune=0&sesab=ACBAABC&id=IDENTIFYING&cc=510"
                                f"&paging=1&sub=0"
                            )
                            # Request pacing is left to the download-delay settings
                            # sketched after the code (time.sleep() in a callback
                            # would block Scrapy's whole reactor).
                            yield scrapy.Request(url=next_url, callback=self.parse_list,
                                                 meta={"item": deepcopy(item)})
                            yield scrapy.Request(url=next_part, callback=self.parse_list,
                                                 meta={"item": deepcopy(item)})
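
    # (Optimization note) Rather than mutating one shared SuningbookItem and
    # deepcopy()ing it into meta at every yield, plain values can ride along
    # via cb_kwargs (available in Scrapy since 1.7). A sketch, not a drop-in
    # change; "labels" is an illustrative name:
    #   yield scrapy.Request(li_url, callback=self.parse_list,
    #                        cb_kwargs={"labels": dict(labels)})
    #   def parse_list(self, response, labels): ...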

    def parse_list(self, response):
        item = response.meta["item"]
        li_list = response.xpath("//ul[@class='clearfix']/li")
        if not li_list:
            # Fall back to a broad selector when the specific layout is missing.
            li_list = response.xpath("//li")
        for li in li_list:
            detail_url = li.xpath(".//div[@class='res-info']/p[@class='sell-point']/a/@href").extract_first()
            if detail_url is not None:
                detail_url = "http:" + detail_url
                yield scrapy.Request(url=detail_url, callback=self.parse_book,
                                     meta={"item": deepcopy(item)})

    def parse_book(self, response):
        item = response.meta["item"]
        item["book_name"] = "".join(response.css("title::text").re(BOOK_NAME))
        main_info = response.xpath("//script[@type='text/javascript']/text()")
        # re_first() returns the first match or None, which replaces all the
        # "x if len(x) > 0 else None" boilerplate and stores strings, not lists.
        item["book_price"] = main_info.re_first(Price)
        item["book_shop"] = main_info.re_first(Shop)
        item["book_publish"] = main_info.re_first(Publish)
        item["book_date"] = response.xpath("//div").re_first(Date)
        writer = response.xpath("//div").re_first(Writer)
        item["book_writer"] = re.sub("(著)|(著作)|(者著)", "", writer) if writer is not None else None
        item["book_phone"] = response.xpath("//div").re_first(Phone)

        # Build the shop-score (JSONP) URL. Assumption: the id comes from the
        # numeric path segments of a product URL shaped like
        # https://product.suning.com/<shopCode>/<partNumber>.html;
        # adjust if the URL scheme differs.
        nums = re.findall(r"/(\d+)", response.url)
        if nums:
            num = "_".join(nums)
            shop_url = (f"http://product.suning.com/pds-web/ajax/getApiRemoteMap_"
                        f"{num}_shopScoreCallback.html?callback=shopScoreCallback")
            yield scrapy.Request(url=shop_url, callback=self.parse_shop,
                                 meta={"item": deepcopy(item)})

    def parse_shop(self, response):
        item = response.meta["item"]
        body = response.body.decode()
        # The response is JSONP: shopScoreCallback({...}). The three scores sit
        # in an escaped-JSON string inside the payload, so the dots in the
        # pattern below match the backslash escapes around the quotes.
        match = json_.search(body)
        if match is None:
            return
        content = re.findall(r'parentIndexScore.":."(.*?).",', match.group(1))
        if len(content) == 3:
            item["book_remark"] = content[0]
            item["Logistics_score"] = content[1]
            item["after_sell"] = content[2]
            yield item
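
For question 1, the one JS-flavored piece I did manage is the JSONP endpoint above: the shopScoreCallback(...) wrapper can be stripped and the payload parsed as real JSON instead of regexed. A minimal, self-contained sketch of that idea; the sample payload and key names below are made up for illustration:

import json
import re

JSONP = re.compile(r"^\s*shopScoreCallback\((.*)\)\s*;?\s*$", re.S)

def parse_jsonp(body):
    """Strip the shopScoreCallback(...) wrapper and parse the payload as JSON."""
    match = JSONP.search(body)
    if match is None:
        raise ValueError("not a shopScoreCallback response")
    return json.loads(match.group(1))

sample = 'shopScoreCallback({"score": {"parentIndexScore": "4.5"}})'
print(parse_jsonp(sample)["score"]["parentIndexScore"])  # -> 4.5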

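For question 2, one thing I already suspect: time.sleep() inside callbacks blocks Scrapy's whole reactor, so request pacing belongs in settings instead. A minimal sketch using standard Scrapy settings (the values are just starting points, not tuned):

import scrapy

class SnbookSpider(scrapy.Spider):
    name = 'snbook'
    custom_settings = {
        "DOWNLOAD_DELAY": 1,                  # seconds between requests
        "CONCURRENT_REQUESTS_PER_DOMAIN": 8,  # cap parallelism per domain
        "AUTOTHROTTLE_ENABLED": True,         # adapt the delay to server latency
        "RETRY_TIMES": 3,                     # retry transient failures
    }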