# -*- coding:utf-8 -*-
"""
作者:4scp8syu@fhzzgfjjjsdzj.anonaddy.com
用途:爬取官网通知
日期:年月日
"""
import os
from urllib.parse import urljoin

import requests
from lxml import etree
# Listing page of official notices; article links are resolved relative to it.
url = "http://www.haue.edu.cn/xwdt/tzgg.htm"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
}
# Folder that receives one .txt file per downloaded notice.
OUTPUT_DIR = './news'
# Seconds to wait for the server; without a timeout a stalled connection
# would hang the whole run.
REQUEST_TIMEOUT = 10

# Characters that cannot (or should not) appear in a file name; each one is
# replaced by "_" so a title such as "a/b" cannot escape OUTPUT_DIR or make
# open() fail.
_UNSAFE_FILENAME_CHARS = str.maketrans(
    {ch: "_" for ch in '\\/:*?"<>|\r\n\t'}
)


def build_article_url(href, base=url):
    """Return the absolute URL for *href* found on the page at *base*.

    The previous implementation deleted every ".." from the href and glued
    the rest onto the host name, which only works for hrefs of the form
    "../path". ``urljoin`` gives the same result for that form and also
    handles absolute URLs, root-relative paths and plain relative paths.
    """
    return urljoin(base, href)


def safe_filename(title_parts, fallback="untitled"):
    """Build a file-system-safe base name from the title text nodes.

    Args:
        title_parts: list of strings returned by the title XPath query.
        fallback: name to use when the title is missing or blank.

    Returns:
        The parts joined with a single space, stripped of surrounding
        whitespace, with unsafe characters replaced by "_"; or *fallback*
        when nothing is left.
    """
    name = ' '.join(title_parts).strip().translate(_UNSAFE_FILENAME_CHARS)
    return name or fallback


def fetch_html(page_url):
    """Download *page_url* and return it parsed with ``etree.HTML``.

    Raises:
        requests.RequestException: on connection problems, timeouts or a
            non-2xx HTTP status.
    """
    response = requests.get(
        url=page_url, headers=header, timeout=REQUEST_TIMEOUT
    )
    response.raise_for_status()
    # Force UTF-8 decoding, as the original script did.
    response.encoding = 'utf8'
    return etree.HTML(response.text)


def save_article(article_url, index):
    """Fetch one notice and write its text to OUTPUT_DIR.

    Args:
        article_url: absolute URL of the notice page.
        index: 1-based position in the listing, used to build a fallback
            file name when the page has no usable title.

    Returns:
        The path of the file that was written.
    """
    page = fetch_html(article_url)
    title_parts = page.xpath('//*[@id="right"]/h1/text()')
    content = ''.join(page.xpath('//*[@id="right"]//text()'))
    name = safe_filename(title_parts, fallback="article_{}".format(index))
    path = os.path.join(OUTPUT_DIR, name + '.txt')
    with open(path, 'w', encoding='utf-8') as fp:
        # Save the article text with every space character removed
        # (behaviour kept from the original script).
        fp.write(content.replace(' ', ''))
    return path


def main():
    """Download every notice linked from the listing page into OUTPUT_DIR."""
    # Create the output folder; exist_ok avoids the check-then-create race.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    listing = fetch_html(url)
    hrefs = listing.xpath('//*[@id="right"]/ul/li//a//@href')
    for index, href in enumerate(hrefs, start=1):
        article_url = build_article_url(href)
        try:
            save_article(article_url, index)
        except (requests.RequestException, OSError) as exc:
            # One unreachable or unwritable article must not abort the run.
            print("skipped {}: {}".format(article_url, exc))
    print("over!")


if __name__ == "__main__":
    main()