本帖最后由 suny1925 于 2022-5-30 09:12 编辑
初次发帖,多多关照。
[Python] 纯文本查看 复制代码
import os
import requests
import html
from bs4 import BeautifulSoup
import pymysql
# Open the MySQL connection that the rest of the script shares.
db = pymysql.connect(
    host="localhost",
    user="root",
    password="root",
    db="szxx",
)
# Header dict holding a user-agent value (not referenced by the code visible here).
hd = {'user-agent': 'chrome/10'}
def download_all_html(id, timeout=10):
    """Download the HTML of one content page.

    Args:
        id: Numeric page identifier; it is interpolated into the page URL.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole crawl.

    Returns:
        The response body as text, or ``''`` when the request fails.
    """
    # Content page URL; replace the placeholder host with the real site.
    url = 'https://*.com/html/' + str(id) + '.html'
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Best effort: report the failure and let the caller treat the page
        # as empty instead of aborting the crawl.
        print('download page ' + str(id) + ' error: ' + str(e))
        return ''
    return response.text
def _image_filename(src):
    """Map an image URL to the relative path it is stored under locally."""
    # Drop the query string, remove ':' (so the URL scheme cannot produce an
    # invalid path component), then strip the upload prefix. The prefix is
    # stripped whether or not the URL carried a query string.
    without_query = src.split('?', 1)[0]
    return without_query.replace(':', '').replace('https//*.com/uploadimages/', '')


def _save_image(src):
    """Download one image below e:/szxx/; failures are reported and skipped."""
    target = 'e:/szxx/{}'.format(_image_filename(src))
    try:
        os.makedirs(os.path.dirname(os.path.realpath(target)), 0o777, exist_ok=True)
        response = requests.get(src, timeout=30)
        # Do not store an HTTP error page under an image file name.
        response.raise_for_status()
        with open(target, 'wb') as f:
            f.write(response.content)
    except (requests.RequestException, OSError) as e:
        # Images are best effort: a broken one must not lose the page text.
        print('image ' + src + ' download error: ' + str(e))


def _store_page(page_id, title, content):
    """Insert the page into table ``shijuan`` or update its existing row."""
    with db.cursor() as cur:
        # Parameterized queries: title/content come from a scraped page and
        # may contain quotes or SQL fragments.
        cur.execute("select count(*) from shijuan where id=%s", (page_id,))
        if cur.fetchone()[0] > 0:
            sql = "update shijuan set title=%s, content=%s where id=%s"
            params = (title, content, page_id)
        else:
            sql = "insert into shijuan(`id`,`title`,`content`) values (%s, %s, %s)"
            params = (page_id, title, content)
        try:
            cur.execute(sql, params)
            db.commit()
            print('No.' + str(page_id) + ' download success')
        except pymysql.MySQLError as e:
            # Roll back the failed statement and say why it failed.
            db.rollback()
            print('**No.' + str(page_id) + ' save error: ' + str(e))


def parse_single_html(id):
    """Scrape one content page: save its images and store its text.

    The page is fetched with ``download_all_html``. The first
    ``div.site-list`` supplies the title (``h3.panel-title``), the body text
    (``div#frameContent``) and the images. Images are written below
    ``e:/szxx/``; title and HTML-escaped body text are upserted into table
    ``shijuan`` through the module-level ``db`` connection.

    Args:
        id: Numeric page identifier, also used as the row id.

    Returns:
        ``''`` when the page has no ``div.site-list``; otherwise ``None``.
        Unexpected errors are printed with their traceback, not raised.
    """
    import traceback

    try:
        soup = BeautifulSoup(download_all_html(id), 'html.parser')
        divs = soup.find_all('div', {'class': 'site-list'})
        if not divs:
            return ''
        block = divs[0]
        title = block.find('h3', {'class': 'panel-title'}).text.strip()
        content = block.find('div', {'id': 'frameContent'}).text.strip()
        for img in block.find_all('img'):
            src = img.get('src')
            # An <img> without src has nothing to download.
            if src:
                _save_image(src)
        _store_page(id, title, html.escape(content))
    except Exception:
        print('**No.' + str(id) + ' download error')
        # format_exc() needs no argument, unlike e.with_traceback(), which
        # raised TypeError from inside this handler.
        print(traceback.format_exc())
# Crawl content pages 1..9999 in order. The finally clause closes the
# database connection even if the long-running loop is interrupted
# (for example with Ctrl+C).
try:
    for i in range(1, 10000):
        parse_single_html(i)
finally:
    db.close()
|