import requests  # HTTP client for fetching pages
from bs4 import BeautifulSoup
import my_fake_useragent as ua
import json
from lxml import etree
class BiQuGe(object):
    def __init__(self):
        self.proxies = None  # proxy settings (None = direct connection)
        self.header = None   # request headers
        self.set_proxies()
    # Search for a novel by keyword
    def get_book(self, searchKey):
        try:
            text = self.geturl("http://www.xsbiquge.org/search?searchkey=" + searchKey)
            if text is None:
                return []
            bookList = []
            searchHtml = etree.HTML(text)
            searchList = searchHtml.xpath('//*[@class="category-div"]')
            for i in searchList:
                bookList.append({"title": self.getstr(i.xpath('div/div/a/h3/text()')),
                                 "img": self.getstr(i.xpath('a/img/@data-original')),
                                 "path": self.getstr(i.xpath('a/@href')),
                                 "abt": self.getstr(i.xpath('div/div/span/text()')),
                                 "desc": self.getstr(i.xpath('div/div[contains(@class,"intro")]/text()')),
                                 })
            return bookList
        except Exception:
            self.del_proxies()
            return None
    # Fetch the chapter list of a novel, paginated client-side:
    # e.g. currentPage=2, pageSize=500 returns links 501-1000
    def get_directory(self, dir, currentPage=1, pageSize=500):
        try:
            text = self.geturl("http://www.xsbiquge.org" + dir)
            if text is None:
                return []
            dirList = []
            searchHtml = etree.HTML(text)
            # the second flex-wrap block on the page holds the chapter links
            searchList = searchHtml.xpath('//div[contains(@class,"flex-wrap")]')[1].xpath(
                'a[position()<={0}]'.format(int(currentPage) * pageSize))
            p = (int(currentPage) - 1) * pageSize
            searchList = searchList[p:]
            for i in searchList:
                dirList.append({"title": self.getstr(i.xpath('text()')),
                                "path": self.getstr(i.xpath('@href'))})
            return dirList
        except Exception:
            self.del_proxies()
            return None
    # Fetch the text of a single chapter page
    def get_content(self, title):
        try:
            text = self.geturl("http://www.xsbiquge.org" + title)
            if text is None:
                return []
            bs = BeautifulSoup(text, "html.parser")
            contentStr = []
            contentList = bs.find_all(class_="content")[0].find_all("p")
            for item in contentList:
                # item.string is None for <p> tags that contain nested markup
                if item.string and len(item.string) > 1:
                    contentStr.append(item.string)
            # cache the working proxy for 10 minutes; needs a Redis client on
            # self.r, which this class never creates, so the call stays disabled
            # self.r.set('xsbiquge', json.dumps(self.proxies), ex=600)
            return contentStr
        except Exception:
            self.del_proxies()
            return []
    # A chapter can be split across two pages (xxx.html and xxx_2.html),
    # so fetch both and concatenate
    def get_content_all(self, title):
        try:
            contentStr = []
            contentStr = contentStr + self.get_content(title)
            contentStr = contentStr + self.get_content(title.replace(".html", "_2.html"))
            return contentStr
        except Exception:
            return []
    # Fetch a URL, retrying up to 5 times with a fresh proxy on each failure
    def geturl(self, url):
        text = None
        for i in range(5):
            text = self.__me_post(url)
            if text is not None:
                break
            self.del_proxies()
            self.set_proxies()
        if text is None:
            self.del_proxies()
            print("{0} failed".format(self.proxies))
            return None
        else:
            return text
    # Drop the failed proxy (stub: no proxy pool is wired in yet)
    def del_proxies(self):
        pass
    # Refresh the proxy and request headers (only the User-Agent is rotated here)
    def set_proxies(self):
        self.header = {
            "User-Agent": ua.UserAgent().random(),
        }
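    # If a proxy pool were added, self.proxies would take the scheme-keyed dict
    # that requests expects, e.g. (hypothetical address):
    # self.proxies = {"http": "http://127.0.0.1:7890",
    #                 "https": "http://127.0.0.1:7890"}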
    # POST request; returns the page text, or None on any failure
    def __me_post(self, url):
        try:
            response = requests.post(url, headers=self.header,
                                     proxies=self.proxies, timeout=1)
            if response.status_code != 200:
                self.del_proxies()
                return None
            else:
                response.encoding = 'utf-8'
                return response.text
        except Exception:
            return None
    # GET request; same contract as __me_post
    def __me_get(self, url):
        response = requests.get(url, headers=self.header,
                                proxies=self.proxies, timeout=1)
        if response.status_code != 200:
            self.del_proxies()
            return None
        else:
            response.encoding = 'utf-8'
            return response.text
    # Return the first element of an XPath result list, stripped, or None
    def getstr(self, arr):
        if len(arr) > 0:
            return arr[0].strip()
        return None
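
A minimal usage sketch (assuming the site markup still matches the XPath above; the search keyword and indexes are only examples):

if __name__ == "__main__":
    spider = BiQuGe()
    books = spider.get_book("诡秘之主")  # hypothetical keyword
    if books:
        # first 500 chapter links of the first search hit
        chapters = spider.get_directory(books[0]["path"], currentPage=1, pageSize=500)
        if chapters:
            # a chapter may span two pages, so use get_content_all
            for line in spider.get_content_all(chapters[0]["path"]):
                print(line)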