一个多月前心血来潮学学了几小时的python爬虫写的,pandas没学,全靠边爬边从csdn直接copy来用的
代码是selenium无脑浏览器直接从加载完的网页抓的数据 (接口是有的,就是我想玩玩selenium)
目前有个bug 就是保存到excel后会丢失前面获取到的数据,也就是指保存了最后一页所爬取到的数据,或者就是直接保存了个空的excel
直接代码贴贴
[Python] 纯文本查看 复制代码 from selenium import webdriver
import sys
import selenium.webdriver.support.ui as ui
import time
import pandas as pd
import csv
# 检测还有没有下一页,没有了提示爬取完毕
def next_out():
wait = ui.WebDriverWait(driver, 10)
try:
clik = wait.until(lambda driver: driver.find_element_by_css_selector('div#pager .jsNxtPage'))
clik.click()
except:
print('爬取完毕')
return
price()
time.sleep(8)#这个延迟根据你的网络延迟设计即可
# 除第一页的所有内容
def price_f():
wait = ui.WebDriverWait(driver, 10)
# 延迟获取
a = []
b = []
c = []
data = {}
lcc = []
ljj = []
try:
lcc = wait.until(
lambda driver: driver.find_elements_by_css_selector("ul#lpBloc li div.prdBlocContainer div.prdtBloc"))
ljj = wait.until(lambda driver: driver.find_elements_by_xpath('//*[@id="lpBloc"]/li/div/div/a'))
except:
print('爬取失败')
return
for ll, lj in zip(lcc, ljj):
# lj = ll.find_elements_by_xpath('.//a').get_attribute("href")
jg = ll.find_element_by_css_selector('div.prdtPrice').text
yf = ll.find_element_by_css_selector('div.jsOverlay div span').text
data['价格'] = a.append(jg)
data['运费'] = b.append(yf)
data['链接'] = c.append(lj.get_attribute("href"))
print('价格:' + jg, '邮费:' + yf) ##获取到的价格和邮费
print('链接:' + lj.get_attribute("href")) ##获取链接
df2 = pd.read_excel(wjm + '.xlsx', sheet_name='黑猫')
df3 = pd.DataFrame({'价格': a, '运费': b, '链接': c}, columns=['价格', '运费', '链接'])
df3.pivot_table(index=['价格', '运费', '链接'], values='链接', aggfunc='first').reset_index()
df3.append(df2)
df3.to_excel(wjm + '.xlsx', sheet_name='黑猫')
next_out()
# 爬取第一页内容
def price():
wait = ui.WebDriverWait(driver, 10)
a = []
b = []
c = []
data = {}
lcc = []
ljj = []
try:
lcc = wait.until(lambda driver: driver.find_elements_by_css_selector("ul#lpBloc li div.prdtBloc"))
ljj = wait.until(lambda driver: driver.find_elements_by_xpath('//*[@id="lpBloc"]/li/div/a'))
except:
price_f()
return
for ll, lj in zip(lcc, ljj):
# lj = ll.find_elements_by_xpath('.//a').get_attribute("href")
jg = ll.find_element_by_css_selector('div.prdtPrice').text
yf = ll.find_element_by_css_selector('div.jsOverlay div span').text
data['价格'] = a.append(jg)
data['运费'] = b.append(yf)
data['链接'] = c.append(lj.get_attribute("href"))
print('价格:' + jg, '邮费:' + yf) ##获取到的价格和邮费
print('链接:' + lj.get_attribute("href")) ##获取链接
# with open('data.xlsx',mode='a',encoding='utf-8',newline="") as f:
# csv_write = csv.writer(f)
# csv_write.writerow([jg, yf, lj.get_attribute("href")])
df0 = pd.read_excel(wjm + '.xlsx', sheet_name='黑猫')
df1 = pd.DataFrame({'价格': a, '运费': b, '链接': c}, columns=['价格', '运费', '链接'])
df1.pivot_table(index=['价格', '运费', '链接'], values='链接', aggfunc='first').reset_index()
df1.append(df0)
df1.to_excel(wjm + '.xlsx', sheet_name='黑猫')
next_out()
def run(url):
# url = 'https://www.cdiscount.com/mpv-113939-blue-eyed-girl1949.html#_his'测试链接
# https://www.cdiscount.com/b-408536-soldes-maison.html
driver.get(url)
price()
if __name__ == '__main__':
url = input('输入链接:')
wjm = input('输入要保存的文件名:')
df = pd.DataFrame()
df.to_excel(wjm + '.xlsx', sheet_name='黑猫')
# 先创建一个excel~
driver = webdriver.Chrome(executable_path='chromedriver.exe')
run(url)
心血来潮学了几小时就再没碰过,这个学期开学突然要血python ps:现在学后端也要学python的吗?我不是java开发方向的吗
如果有大佬能修复优化一下就更好了,
如果帖子有什么违规的地方麻烦管理帮忙删一下,谢谢了
|