PhantomJS与driver结合,爬取全网商品的历史价格
【1】PhantomJS是一个很小的浏览器,没有界面,只执行访问请求和爬取数据,比起火狐和谷歌少了很多不必要的开销,会提升爬取数据的效率。【2】driver(selenium 的 webdriver)是一个很好的定位工具,代码编写很简单
【3】小编通过一个vvv的网站获取到淘宝京东苏宁拼多多的商品历史价格,再发送到android客户端进行展示,相当于是做了个比价的android app,
在此分享python 的脚本及android apk(android客户端的代码还有用,暂时不分享,后续会更新),希望大家能给一下用户体验感想哈
【4】apk下载链接:https://fir.im/1z94
【5】python脚本:接收客户端的商品链接,从vvv中爬取相关历史变化价格图并保存在本地,然后再发送给客户端,供给正在学爬虫的同学学习~
import requests
import socket
import threading
import os
import time
from selenium import webdriver
from PIL import Image
# Location of the PhantomJS executable used as the headless browser.
PHANTOMJS_PATH = r"C:\Users\Administrator\Desktop\phantomjs.exe"

# Single module-level WebDriver instance used by the scraping helpers below.
driver = webdriver.PhantomJS(executable_path=PHANTOMJS_PATH)
def get_target_url(url):
    """Load hisprice.cn in the module-level ``driver`` and submit ``url``.

    Types ``url`` into the element with id ``kValId``, clicks the element
    matched by the CSS selector ``button``, then sleeps for one second
    before returning.
    """
    driver.get("http://www.hisprice.cn/")

    input_box = driver.find_element_by_id("kValId")
    input_box.send_keys(url)

    submit_button = driver.find_element_by_css_selector("button")
    submit_button.click()

    time.sleep(1)
def get_title():
    """Return the text of the ``titleId`` element on the page loaded in ``driver``.

    The text is printed before being returned. If the lookup (or the print)
    raises, or the text is empty, the placeholder string "获取标题失败!" is
    returned instead, so the caller always receives a non-empty ``str``.
    """
    fallback = "获取标题失败!"
    try:
        title = driver.find_element_by_id("titleId").text
        print(title)
    except Exception:
        # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed here.
        return fallback
    # Treat any falsy value (empty string, None) as "no title": the caller
    # concatenates the result with other strings.
    if not title:
        return fallback
    return title
def get_price():
    """Return the text of the ``minMaxDivId`` element on the page loaded in ``driver``.

    The text is printed before being returned. If the lookup (or the print)
    raises, or the text is empty, the placeholder string
    "获取最低最高价格失败!" is returned instead, so the caller always receives
    a non-empty ``str``.
    """
    fallback = "获取最低最高价格失败!"
    try:
        price = driver.find_element_by_id("minMaxDivId").text
        print(price)
    except Exception:
        # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed here.
        return fallback
    # Treat any falsy value (empty string, None) as "no price": the caller
    # concatenates the result with other strings.
    if not price:
        return fallback
    return price
def get_snap(driver, file_path):
    """Screenshot the page currently shown by ``driver`` and open the result.

    The screenshot is written to ``file_path`` (the whole page is captured,
    not a single element) and that file is then opened with PIL.

    Returns:
        The ``PIL.Image`` object opened from ``file_path``.
    """
    driver.save_screenshot(file_path)
    return Image.open(file_path)
def get_image(driver, file_path):
    """Crop the ``container`` element out of a page screenshot.

    Locates the element with id ``container``, waits two seconds, reads its
    position and size, takes a full screenshot via ``get_snap`` and crops it
    to the element's bounding box. The cropped image overwrites the
    screenshot at ``file_path``.

    Returns:
        The cropped PIL image.
    """
    chart = driver.find_element_by_id("container")
    time.sleep(2)

    origin = chart.location
    print(origin)
    dimensions = chart.size

    x0, y0 = origin['x'], origin['y']
    # Bounding box as (left, top, right, bottom), in screenshot coordinates.
    box = (x0, y0, x0 + dimensions['width'], y0 + dimensions['height'])

    cropped = get_snap(driver, file_path).crop(box)
    cropped.save(file_path)
    return cropped
def _send_file(client_socket, file_path):
    """Stream the file at ``file_path`` to ``client_socket`` in 1 KiB chunks."""
    # The accept loop hands over a non-blocking socket; switch to blocking so
    # a full send buffer makes sendall() wait instead of raising mid-transfer.
    client_socket.setblocking(True)
    with open(file_path, "rb") as fp:
        while True:
            chunk = fp.read(1024)
            if not chunk:
                print("send all")
                break
            # sendall() retries until the whole chunk is written; plain
            # send() may write only part of it and the rest would be lost.
            client_socket.sendall(chunk)


def _reply_failure(client_socket):
    """Best-effort: send ``b"no"`` so the client knows the request failed."""
    try:
        client_socket.sendall(bytes("no", encoding="utf8"))
    except OSError as exc:
        # The connection is already unusable; there is nobody left to notify.
        print("could not notify client:", exc)


def handle_client(client_socket, client_id):
    """Serve one price-history request on ``client_socket``, then close it.

    Protocol, as implemented here:
      1. Wait for the client to send a product URL (at most 1024 bytes).
      2. Scrape the page with ``get_target_url`` / ``get_image`` and send the
         header ``<title>^<price>^<image size in bytes>`` encoded as UTF-8.
      3. Sleep one second, then read the client's answer; if it is ``b"ok"``
         stream the saved image file, otherwise do nothing further.
      4. On any failure reply with ``b"no"``.

    Args:
        client_socket: Connected socket for this client.
        client_id: Sequence number assigned by the accept loop (not used here).

    NOTE(review): the module-level ``driver`` is shared by every handler
    thread and file names only have one-second resolution, so concurrent
    requests can interfere with each other. This is unchanged from the
    original behaviour.
    """
    try:
        # --- 1. wait for the request ------------------------------------
        while True:
            try:
                request_data = client_socket.recv(1024)
            except (BlockingIOError, socket.timeout):
                # Nothing received yet: poll again shortly.
                time.sleep(0.2)
                continue
            except OSError:
                # Connection reset / socket error: nothing more can be done.
                return
            if not request_data:
                # Empty read means the peer closed the connection. Without
                # this check the loop would spin forever at full CPU.
                return
            break

        # --- 2. scrape and send the header ------------------------------
        try:
            data_url = str(request_data, encoding="utf_8")
            now_time = time.strftime("%d-%H-%M-%S", time.localtime(time.time()))
            file_path = r"C:\Users\Administrator\Desktop\price_Search\picture./" + now_time + ".png"
            # Scraping now runs inside the try so that a failure is reported
            # to the client instead of silently killing this thread.
            get_target_url(data_url)
            get_image(driver, file_path)
            length = os.path.getsize(file_path)
            header = get_title() + "^" + get_price() + "^" + str(length)
            client_socket.sendall(bytes(header, "utf-8"))
        except Exception:
            _reply_failure(client_socket)
            print("凉了,该商品没有被收录")
            return

        # --- 3. wait for the acknowledgement and send the image ----------
        time.sleep(1)
        try:
            ack = client_socket.recv(1024)
            if ack == b"ok":
                _send_file(client_socket, file_path)
            else:
                print("have no data")
        except Exception:
            print("发送失败")
            _reply_failure(client_socket)
    finally:
        # Always release the connection, including on the failure paths that
        # previously left the socket open.
        client_socket.close()
# Address the server listens on: the IPv4 address that this machine's own
# host name resolves to, on TCP port 8000.
hostname = socket.gethostname()
host = socket.gethostbyname(hostname)
port, BUFSIZE = 8000, 1024
ADDR = (host, port)
if __name__ == "__main__":
    # socket() creates the listening socket: AF_INET selects the IPv4 address
    # family and SOCK_STREAM a stream (TCP) socket. The ``with`` block closes
    # it when the loop exits, e.g. on Ctrl+C or an unexpected error.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as server_socket:
        # NOTE: SO_REUSEADDR is not set; the original left that call
        # commented out.
        server_socket.bind(ADDR)  # bind to the configured host and port
        server_socket.listen(2)  # start listening
        client_num = 0
        while True:
            client_socket, client_address = server_socket.accept()
            print("[%s, %s]用户连接上了" % client_address)
            # The handler polls its socket, so hand it over non-blocking.
            client_socket.setblocking(False)
            # ``target`` is what the thread runs; ``args`` are the arguments
            # passed to it. Sockets are no longer collected in a list that
            # was never read: it only grew and kept every connection alive.
            handle_client_thread = threading.Thread(
                target=handle_client, args=(client_socket, client_num)
            )
            client_num += 1
            handle_client_thread.start()
支持楼主 大佬~ 貌似拼多多搞不到啊~ zhengpengxin 发表于 2019-12-8 13:53
大佬~ 貌似拼多多搞不到啊~
感谢你的建议,我检查一下哈 支持支持
页:
[1]