```
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
from pyquery import PyQuery as pq
import pandas as pd
from selenium.webdriver.chrome.options import Options
from time import sleep

chrome_options = Options()
chrome_options.add_argument('--headless')
# Headless mode is prepared but not enabled: the login step below needs a
# visible browser. Pass options=chrome_options here to run headless.
browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)
data = pd.DataFrame()
browser.set_window_size(1400, 900)


def search():
    print('Searching...')
    try:
        browser.get('https://www.taobao.com')
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
        )
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
        input.send_keys('美食')  # put the keyword you want to search for here
        submit.click()
        zh()
        infor()
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total'))
        )
        return total.text
    except TimeoutException:
        return search()  # return the retry's result so the caller never gets None


def zh():
    # My account (or IP) seems to have been flagged as a crawler, so every
    # search redirects to the login page. If yours hasn't been flagged,
    # searching works without logging in.
    print('Logging in...')
    try:
        input_zh = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#fm-login-id'))
        )
        input_key_words = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#fm-login-password'))
        )
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#login-form > div.fm-btn > button')))
        input_zh.send_keys('xxxx')         # xxxx = your account name
        input_key_words.send_keys('xxxx')  # xxxx = your password
        submit.click()
    except TimeoutException:
        zh()


def next_page(page_num):
    print('Turning page, now on page {}'.format(page_num))
    sleep(5)  # pause 5 seconds per page to avoid Taobao's anti-crawling checks
    try:
        input_page_num = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
        )
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
        input_page_num.clear()
        input_page_num.send_keys(page_num)
        submit.click()
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_num)))
        infor()
    except TimeoutException:
        next_page(page_num)


def infor():
    global data
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    html = browser.page_source
    doc = pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        # pyquery returns None/'' for missing nodes instead of raising, so use
        # `or` for the fallback value rather than try/except
        img = item.find('.pic .img').attr('src') or '无'   # image URL
        price = item.find('.price').text() or '无'         # price
        deal = item.find('.deal-cnt').text()[:-3] or '无'  # units sold ([:-3] strips the trailing '人付款')
        goods = item.find('.title').text() or '无'         # product name
        shop = item.find('.shop').text() or '无'           # shop name
        location = item.find('.location').text() or '无'   # region
        information = {
            "img": img,
            "price": price,
            "deal": deal,
            "goods": goods,
            "shop": shop,
            "location": location
        }
        if data.empty:
            data = pd.DataFrame(information, index=[0])
        else:
            # DataFrame.append() was removed in pandas 2.0; pd.concat works everywhere
            data = pd.concat([data, pd.DataFrame(information, index=[0])], ignore_index=True)


def main():
    browser.get("http://httpbin.org/ip")  # quick check of the exit IP the crawler will use
    print(browser.page_source)
    total = search()
    total = int(re.compile(r'(\d+)').search(total).group(1))
    # total is the page count to crawl; I read the maximum automatically here,
    # but you can also set it yourself. Page 1 was already scraped by search(),
    # so continue from page 2 through page `total`.
    for i in range(2, total + 1):
        next_page(i)
    # change the save path to your own; use encoding='utf-8-sig' if the CSV
    # shows garbled Chinese when opened in Excel
    data.to_csv(r'D:\python work\taobao_spider\infor\meishi1.csv', encoding='utf-8')
    browser.close()
    print('Crawl finished')


if __name__ == '__main__':
    main()
```
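One caveat about the script above: every TimeoutException is handled by the function calling itself again with no upper bound, so a persistent block from Taobao turns into unbounded recursion. A small decorator that caps the number of attempts is one way around that. This is only a sketch; `retry_on_timeout` and its `times` parameter are my own names, not part of the original script.

```
from functools import wraps
from selenium.common.exceptions import TimeoutException

def retry_on_timeout(times=3):
    """Retry the wrapped function up to `times` times on TimeoutException."""
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            last_exc = None
            for attempt in range(1, times + 1):
                try:
                    return func(*args, **kwargs)
                except TimeoutException as exc:
                    last_exc = exc
                    print('Timeout in {} (attempt {}/{})'.format(func.__name__, attempt, times))
            raise last_exc  # give up after `times` attempts
        return wrapper
    return decorator

# Usage: decorate the function instead of recursing in the except branch, e.g.
# @retry_on_timeout(times=3)
# def next_page(page_num): ...
```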
Since logging in involves private credentials, I haven't shown that part.

The one remaining flaw is that, of the images scraped, only roughly the first ten per page are valid, and I don't know what causes it. If anyone understands this, I'd appreciate some pointers.

Today I dug into the incomplete-image problem. After a few tests I found that if the scrollbar never moves down, the images are not loaded, so we can't grab them. I then tried jumping straight to the bottom of the page, but that doesn't work either; the images only load when you scroll down gradually. So I read the page's scroll height and scrolled down 1/5 of it at a time, five times in total, after which every image could be scraped. Only part of the infor() function needs to change:
```
def infor():
    global data
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item .pic')))
    # total scroll height of the page
    h = browser.execute_script('var lenOfPage=document.body.scrollHeight; return lenOfPage;')
    step = h // 5          # scroll 1/5 of the height each time; adjust the fraction to taste
    over = step
    for _ in range(5):     # five steps, so the last one reaches the bottom
        # window.scrollTo(x, y): keep x at 0 and only move the vertical offset
        browser.execute_script('window.scrollTo(0, {})'.format(over))
        over += step
        sleep(3)           # rest 3 seconds after each step so the lazy images can load
    html = browser.page_source
    doc = pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        img = item.find('.pic .img').attr('src') or '无'
        price = item.find('.price').text() or '无'
        deal = item.find('.deal-cnt').text()[:-3] or '无'
        goods = item.find('.title').text() or '无'
        shop = item.find('.shop').text() or '无'
        location = item.find('.location').text() or '无'
        information = {
            "img": img,
            "price": price,
            "deal": deal,
            "goods": goods,
            "shop": shop,
            "location": location
        }
        if data.empty:
            data = pd.DataFrame(information, index=[0])
        else:
            data = pd.concat([data, pd.DataFrame(information, index=[0])], ignore_index=True)
```
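As an alternative (or complement) to scrolling, it may be worth checking in the browser devtools whether the unloaded thumbnails keep their real URL in a placeholder attribute before they enter the viewport; lazy-loading pages commonly use something like data-src for this. I haven't verified which attribute Taobao's result page uses, so treat this as a guess to test rather than a confirmed fix:

```
# Hypothetical fallback for lazy-loaded thumbnails: try a placeholder
# attribute first, then the live src. Whether the attribute is really
# called data-src on Taobao's page needs checking in devtools.
pic = item.find('.pic .img')
img = pic.attr('data-src') or pic.attr('src') or '无'
```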