PhantomJS与driver结合，爬取全网商品的历史价格

Darren_Smith · 发表于 2019-12-8 10:59

【1】PhantomJS是一个很小的浏览器，没有界面，只执行访问请求和爬取数据，比起火狐和谷歌少了很多不必要的开销，会提升爬取数据的效率
【2】driver（即Selenium的WebDriver）是一个很好的元素定位工具，代码编写很简单
【3】小编通过一个vvv的网站获取到淘宝、京东、苏宁、拼多多的商品历史价格，再发送到android客户端进行展示，相当于是做了个比价的android app。
在此分享python 的脚本及android apk(android客户端的代码还有用，暂时不分享，后续会更新)，希望大家能给一下用户体验感想哈
【4】apk下载链接：https://fir.im/1z94

【5】python脚本：接收客户端的商品链接，从vvv中爬取相关历史变化价格图并保存在本地，然后再发送给客户端，供给正在学爬虫的同学学习~

import requests
import socket
import threading
import os
import time
from selenium import webdriver
from PIL import Image

# Module-level side effect: a PhantomJS browser session is created as soon as
# this file is executed or imported, using a hard-coded path to the executable.
# The functions below that take no ``driver`` argument use this global session.
# NOTE(review): PhantomJS support was deprecated in Selenium 3.8 and removed in
# Selenium 4 -- confirm the installed Selenium version still provides it.
driver=webdriver.PhantomJS(executable_path=r"C:\Users\Administrator\Desktop\phantomjs.exe")
def get_target_url(url):
    """Submit *url* through the search form on the hisprice.cn home page.

    Loads the home page in the module-level ``driver`` session, types *url*
    into the element with id ``kValId`` and clicks the first ``button``
    element. Returns ``None``; the result page stays loaded in ``driver``.
    """
    driver.get("http://www.hisprice.cn/")

    search_box = driver.find_element_by_id("kValId")
    search_box.send_keys(url)

    submit_button = driver.find_element_by_css_selector("button")
    submit_button.click()

    # Fixed one-second pause after the click -- presumably to let the result
    # page load before callers read from it; TODO confirm it is long enough.
    time.sleep(1)
def get_title():
    """Return the product title shown on the page currently loaded in ``driver``.

    Reads the text of the element with id ``titleId`` and echoes it to stdout.

    Returns:
        The title text, or a fixed failure message when the text is empty or
        when the lookup (or the echo) raises. The two failure messages differ
        only in their final punctuation mark and are kept exactly as the
        client has always received them.
    """
    try:
        # The print stays inside the try on purpose: if echoing the title
        # fails, the caller still gets the failure message instead of an
        # exception, exactly as before.
        title = driver.find_element_by_id("titleId").text
        print(title)
        if title == "":
            title = "获取标题失败!"
        return title
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit; ordinary errors are still reported as a failed lookup.
        return "获取标题失败！"
def get_price():
    """Return the lowest/highest price summary shown on the page in ``driver``.

    Reads the text of the element with id ``minMaxDivId`` and echoes it to
    stdout.

    Returns:
        The summary text, or a fixed failure message when the text is empty or
        when the lookup (or the echo) raises. The two failure messages differ
        only in their final punctuation mark and are kept exactly as the
        client has always received them.
    """
    try:
        # The print stays inside the try on purpose: if echoing the text
        # fails, the caller still gets the failure message instead of an
        # exception, exactly as before.
        price = driver.find_element_by_id("minMaxDivId").text
        print(price)
        if price == "":
            price = "获取最低最高价格失败!"
        return price
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit; ordinary errors are still reported as a failed lookup.
        return "获取最低最高价格失败！"
def get_snap(driver, file_path):
    """Screenshot the page loaded in *driver* and return it as a PIL image.

    The screenshot is written to *file_path* first and then opened from that
    same path, so the file is left on disk.
    """
    driver.save_screenshot(file_path)
    return Image.open(file_path)

def get_image(driver, file_path):
    """Save and return the price-history chart cropped out of a page screenshot.

    Locates the element with id ``container``, takes a screenshot of the page
    via ``get_snap``, crops it to that element's bounding box and overwrites
    *file_path* with the cropped picture.

    Returns:
        The cropped PIL image.
    """
    chart = driver.find_element_by_id("container")

    # Two-second pause before reading the element's geometry -- presumably to
    # let the chart finish rendering; TODO confirm.
    time.sleep(2)

    origin = chart.location
    print(origin)
    extent = chart.size

    # Bounding box as (left, top, right, bottom) in page coordinates.
    crop_box = (
        origin['x'],
        origin['y'],
        origin['x'] + extent['width'],
        origin['y'] + extent['height'],
    )

    full_page = get_snap(driver, file_path)
    chart_image = full_page.crop(crop_box)
    chart_image.save(file_path)
    return chart_image
# Every client thread drives the same module-level browser session, so the
# navigate / screenshot / read sequence must not interleave between clients.
_DRIVER_LOCK = threading.Lock()


def _scrape_price_chart(data_url):
    """Look up *data_url* and return ``(file_path, title, price)``.

    The chart picture is written to *file_path*. The whole sequence runs under
    ``_DRIVER_LOCK``, so concurrent clients are served one at a time.
    """
    with _DRIVER_LOCK:
        now_time = time.strftime("%d-%H-%M-%S", time.localtime(time.time()))
        file_path = r"C:\Users\Administrator\Desktop\price_Search\picture./" + now_time + ".png"
        get_target_url(data_url)
        get_image(driver, file_path)
        return file_path, get_title(), get_price()


def _send_refusal(client_socket):
    """Send the literal reply ``b"no"`` to tell the client the request failed."""
    try:
        client_socket.sendall(b"no")
    except OSError:
        # The peer is already gone, so there is nobody left to notify.
        pass


def handle_client(client_socket, client_id):
    """Serve one connected client until its picture is sent or it goes away.

    Protocol, unchanged from the original flow:
      1. The client sends a product URL (UTF-8, read with one ``recv(1024)``).
      2. The server replies ``title^price^size`` where *size* is the byte
         length of the chart picture.
      3. The client answers ``b"ok"``; the server streams the picture and
         closes the connection. Any other answer makes the server wait for a
         new request on the same connection.
      4. On any failure the server sends ``b"no"`` and closes the connection.

    Args:
        client_socket: Connected TCP socket; it is always closed on return.
        client_id: Sequence number assigned by the accept loop (not used here).
    """
    # Work in blocking mode regardless of how the socket was handed over.
    # On a non-blocking socket a send may write only part of the data or
    # raise BlockingIOError, which truncated the picture.
    client_socket.setblocking(True)
    try:
        while True:
            try:
                request_data = client_socket.recv(1024)
            except OSError:
                return  # connection reset or otherwise unusable
            if not request_data:
                # Empty read means the peer closed the connection. The
                # original loop kept spinning here forever.
                return

            try:
                data_url = str(request_data, encoding="utf_8")
                file_path, title, price = _scrape_price_chart(data_url)
                header = title + "^" + price + "^" + str(os.path.getsize(file_path))
            except Exception as exc:
                # Scraping now sits inside the try: a product that is not
                # listed used to kill the thread without notifying the client.
                _send_refusal(client_socket)
                print("凉了,该商品没有被收录")
                print(repr(exc))
                return

            try:
                client_socket.sendall(bytes(header, "utf-8"))
                # Blocking recv waits for the client's answer, which replaces
                # the fixed one-second sleep the polling version needed.
                ack = client_socket.recv(1024)
                if ack != b"ok":
                    print("have no data")
                    continue
                with open(file_path, "rb") as fp:
                    for chunk in iter(lambda: fp.read(1024), b""):
                        client_socket.sendall(chunk)
                print("send all")
                return
            except OSError:
                print("发送失败")
                _send_refusal(client_socket)
                return
    finally:
        client_socket.close()

# Listening address: this machine's own hostname resolved to an address, so the
# server binds to that address rather than to every interface.
hostname = socket.gethostname()
host = socket.gethostbyname(hostname)
port = 8000  # TCP port the server binds to
BUFSIZE = 1024  # not referenced in the visible code, which passes 1024 literally
ADDR = (host, port)  # (address, port) pair handed to bind()
if __name__ == "__main__":
    # socket() creates the listening socket: AF_INET selects the IPv4 address
    # family and SOCK_STREAM requests a stream socket.
    server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # NOTE: accepted sockets are appended here and never removed or read back.
    client_socket_list = []
    client_num = 0
    Isready = False  # not referenced anywhere in the visible code

    try:
        # server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)  # address reuse (left disabled)
        server_socket.bind(ADDR)  # bind to the configured address and port
        server_socket.listen(2)  # start listening

        while True:
            client_id = client_num
            client_socket, client_address = server_socket.accept()
            print("[%s, %s]用户连接上了" % client_address)
            # target is the function the thread runs; args are the arguments
            # passed to it. Daemon threads keep a stalled client from blocking
            # interpreter shutdown.
            handle_client_thread = threading.Thread(
                target=handle_client,
                args=(client_socket, client_id),
                daemon=True,
            )
            client_socket_list.append(client_socket)
            client_num += 1
            # The handler receives the socket in non-blocking mode.
            client_socket.setblocking(0)
            handle_client_thread.start()
    except KeyboardInterrupt:
        # Ctrl+C is the normal way to stop the server; fall through to cleanup.
        pass
    finally:
        # Release the listening port and stop the PhantomJS process, which
        # were both left behind on exit before.
        server_socket.close()
        driver.quit()

亂爱 · 发表于 2019-12-8 13:11

提示: 作者被禁止或删除内容自动屏蔽

钒事钒事 · 发表于 2019-12-8 13:20

支持楼主

zhengpengxin · 发表于 2019-12-8 13:53

大佬~ 貌似拼多多搞不到啊~

Darren_Smith · 发表于 2019-12-10 20:16

zhengpengxin 发表于 2019-12-8 13:53
大佬~ 貌似拼多多搞不到啊~

感谢你的建议，我检查一下哈

MOEYU_VANILLA · 发表于 2019-12-10 22:19

支持支持

帐号		自动登录	找回密码
密码			注册[Register]

亂爱亂爱当前离线好友阅读权限 0 听众最后登录 1970-1-1 头像被屏蔽	亂爱发表于 2019-12-8 13:11 提示: 作者被禁止或删除内容自动屏蔽
亂爱亂爱当前离线好友阅读权限 0 听众最后登录 1970-1-1 头像被屏蔽	【吾爱破解论坛总版规】 - [让你充分了解吾爱破解论坛行为规则]
	回复支持举报

[Python 转载] PhantomJS与driver结合，爬取全网商品的历史价格

免费评分