# Scrape school notices (爬取学校通知)
# -*- coding:utf-8 -*-
"""
Author: 4scp8syu@fhzzgfjjjsdzj.anonaddy.com
Purpose: scrape the notice list from the university website and save each
         notice's text into ./news/<title>.txt
Date: (not set)
"""
import requests
import os
from lxml import etree

url = "http://www.haue.edu.cn/xwdt/tzgg.htm"

# Create the output folder on first run.
if not os.path.exists('./news'):
    os.mkdir('./news')

header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
}

# Named `res` (not `re`) so the stdlib regex module is not shadowed.
res = requests.get(url=url, headers=header)
res.encoding = 'utf8'
html = etree.HTML(res.text)

# Notice titles and their relative hrefs on the listing page.
title = html.xpath('//*[@id="right"]/ul/li//a/text()')
lists = html.xpath('//*[@id="right"]/ul/li//a//@href')

for j in lists:
    # hrefs look like "../path/to/page.htm" — strip the ".." and
    # prepend the site host to get an absolute URL.
    j = j.replace("..", "")
    jurl = 'http://www.haue.edu.cn' + j
    request = requests.get(url=jurl, headers=header)
    request.encoding = 'utf8'
    # Separate variable so the listing-page tree above is not clobbered.
    page = etree.HTML(request.text)
    # Notice title (used as the file name) and full body text.
    t = page.xpath('//*[@id="right"]/h1/text()')
    content = ''.join(page.xpath('//*[@id="right"]//text()'))
    file = './news/' + ' '.join(t) + '.txt'
    with open(file, 'w', encoding='utf-8') as fp:
        # Save the article text; spaces are stripped as in the original
        # (an intentional cleanup of the page's layout whitespace).
        fp.write(content.replace(' ', ''))
    print("over!")
替换字符串或者用re提取 re=requests.get(url=url,headers=header) 这里应改成res,re是正则库,容易造成命名冲突
厉害厉害 解析HTML嘛,看看能不能找到接口 通知的正文,可以单独拿出来处理下。
正文xpath好像是“//*[@id="right"]/div/div/div” 感谢分享 还行吧哈哈 正好学习一波,不过还是有好多语句意思不懂。感谢分享
页:
[1]
2