This is a comic scraper I wrote for a comic site:

from selenium import webdriver
from time import sleep
import re
import requests
from lxml import etree
import os

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36 Edg/94.0.992.47"
}

def mkdir(path):
    # Create the save folder if it does not already exist
    if not os.path.exists(path):
        os.makedirs(path)  # makedirs also creates any missing parent directories

def huode_url(url):
    # Load the page in Edge and return the parsed HTML tree
    drive = webdriver.Edge()
    drive.minimize_window()
    drive.get(url)
    sleep(2)  # give the page's JavaScript time to render
    text = drive.page_source
    drive.quit()  # quit() ends the whole driver session; close() only closes the window
    html = etree.HTML(text)
    return html

def get_tupian_url(text):
    # Collect the comic image links (lazy-loaded, so they live in data-src)
    manhualianjie_url_list = text.xpath('//img/@data-src')
    return manhualianjie_url_list

def chuangjian_wenjianjia(text, manhualianjie_url_list):
    # Build the output directory from the comic name and the chapter title
    manhuazhangjie = text.xpath('//head/title/text()')
    manhuaminglist = text.xpath('//div/a/text()')
    manhuaming = str(manhuaminglist[1])
    q = str(manhuazhangjie[0])
    print(q)  # show which chapter is being processed
    ret = re.split(r":| ", q)
    outfile = r'E:\{}\{}'.format(manhuaming, ret[0])  # raw string keeps the backslashes literal
    mkdir(outfile)  # outfile is already absolute, so joining it onto os.getcwd() was a no-op
    xiazai_tupian(manhualianjie_url_list, ret, outfile)

def xiazai_tupian(manhualianjie_url_list, ret, outfile):
    # Download every comic image into the chapter folder.
    # enumerate() numbers the pages directly; list.index() would return the
    # wrong position whenever the same URL appears more than once
    for index, item_url in enumerate(manhualianjie_url_list, start=1):
        response = requests.get(item_url, headers=headers).content
        path = '{}{}'.format(ret[0], ret[1])
        with open("{}/{}{}.jpg".format(outfile, path, index), 'wb') as f:
            f.write(response)

def shangyizhang_url(text):
    # The "previous chapter" link is the third <a> from the end of the page
    shangyizhang_url_list = text.xpath('//a/@href')
    shangyizhang_url = 'http://www.sixmh7.com' + shangyizhang_url_list[-3]
    return shangyizhang_url

def tianjian(text):
    # Keep going until the "previous chapter" link points back at the comic's home page
    shangyizhang_url_list = text.xpath('//a/@href')
    return shangyizhang_url_list[1] != shangyizhang_url_list[-3]

def main():
    url = 'http://www.sixmh7.com/16081/1300180.html'  # the chapter to start crawling from
    i = True
    while i:
        html = huode_url(url)
        manhualianjie_url_list = get_tupian_url(html)
        chuangjian_wenjianjia(html, manhualianjie_url_list)
        url = shangyizhang_url(html)
        i = tianjian(html)
        sleep(5)  # be polite to the server between chapters
    print('Done')

if __name__ == '__main__':
    main()

The script uses selenium to grab the rendered page source, os to save the files locally, and re plus lxml to parse the elements and content out of the page.
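
If the chapter pages turned out to be served statically, the same fetch-and-parse step could be done with requests alone and skip the browser entirely. A minimal sketch under that assumption (the site may well fill in the data-src attributes with JavaScript, which is exactly why the script drives a real browser):

import requests
from lxml import etree

def fetch_static(url, headers):
    # Fetch the raw HTML and parse it into an lxml tree for XPath queries
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding  # let requests guess the page encoding
    return etree.HTML(response.text)

# html = fetch_static('http://www.sixmh7.com/16081/1300180.html', headers)
# image_urls = html.xpath('//img/@data-src')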
Shortcomings of the script:
1. No search feature has been added yet.
2. Selenium occasionally pops up a visible window when it launches Edge (a headless workaround is sketched below).
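
For shortcoming 2, running Edge in headless mode avoids the visible window entirely. A minimal sketch, assuming Selenium 4 (older Selenium 3 setups needed the separate msedge-selenium-tools package for Edge options):

from selenium import webdriver
from selenium.webdriver.edge.options import Options

def headless_edge():
    # Launch Edge without opening a visible window
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    return webdriver.Edge(options=options)

huode_url() could then build its driver with headless_edge() instead of webdriver.Edge() and drop the minimize_window() call.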