import requests
from lxml import etree
import os
def makedirs(dir_name):
    """Create the directory ``dir_name`` (and missing parents) if needed.

    Calling this for a directory that already exists is a no-op.

    Args:
        dir_name: Path of the directory to create.

    Raises:
        OSError: If the directory cannot be created, e.g. ``dir_name``
            already exists but is not a directory.
    """
    # exist_ok=True replaces the separate isdir() check, so there is no gap
    # between "check" and "create" in which the directory could appear.
    os.makedirs(dir_name, exist_ok=True)
# Make sure the download directory exists before anything is saved into it.
makedirs('./che/')

# HTTP request headers passed to requests.get(); the user-agent value is a
# desktop Chrome identification string.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/67.0.3396.62 Safari/537.36'
    ),
}
def get_attr(el, path, attr):
    """Return attribute ``attr`` of the first node matching ``path`` under ``el``.

    Args:
        el: Node exposing an ``xpath(path)`` method that returns a list of
            matching nodes.
        path: XPath expression evaluated against ``el``.
        attr: Name of the attribute to read from the first match.

    Returns:
        The attribute value, or ``''`` when nothing matches ``path`` or the
        first match does not carry ``attr``.
    """
    els = el.xpath(path)
    if not els:
        return ''
    # Pass '' as the default so a missing attribute yields the same
    # empty-string result as a missing node instead of None.
    return els[0].get(attr, '')
def get_text(el, path):
    """Return the first text node found beneath ``path``, relative to ``el``.

    The query evaluated is ``path + '//text()'``. An empty string is returned
    when that query yields no results.
    """
    matches = el.xpath(path + '//text()')
    return matches[0] if matches else ''
# Imported here so this script section brings its own dependency into scope.
from urllib.parse import urljoin

# Seconds to wait on any single HTTP request. Without a timeout, requests can
# block indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 10

# One index page is requested per letter, 'a' through 'z'.
for i in range(26):
    url = "http://www.chebiao.cc/chebiaodaquan/" + chr(i + 97)  # 97 == ord('a')
    print(url)
    try:
        req = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        req.raise_for_status()
    except requests.RequestException as e:
        # A single unreachable index page should not abort the other letters.
        print(e)
        continue
    req.encoding = 'utf8'
    root = etree.HTML(req.text)  # parse into a tree that supports XPath queries
    images = root.xpath('//div[@class="mainbox"]/a')
    print('共' + str(len(images)) + '个')
    for img in images:
        imgurl = get_attr(img, './img', 'src')
        if not imgurl:
            # No usable image address on this entry; nothing to download.
            print('未找到图片地址, 已跳过')
            continue
        # Surrounding whitespace from the markup must not end up in file names.
        title = get_text(img, './p[position()=1]').strip()
        subtitle = get_text(img, './p[position()=2]').strip()
        # Resolve relative or protocol-relative src values against the page URL.
        imgurl = urljoin(url, imgurl)
        # File name is <LETTER>_<title>_<subtitle>.<ext>, where <ext> is the
        # text after the last dot of the image URL (65 == ord('A')).
        file_name = ('./che/' + chr(i + 65) + '_' + title + '_' + subtitle
                     + '.' + imgurl.split('.')[-1])
        print(file_name)
        try:
            # Same headers as the page request, plus a timeout.
            res = requests.get(imgurl, headers=headers, timeout=REQUEST_TIMEOUT)
            # Do not save an HTTP error body as if it were an image.
            res.raise_for_status()
            # 'wb' creates the file if it does not exist and writes raw bytes.
            with open(file_name, 'wb') as f:
                f.write(res.content)
        except (requests.RequestException, OSError) as e:
            # Best effort: report the failure and move on to the next logo.
            print(e)
print("车标抓取完成!")