本帖最后由 Leafmoes 于 2020-2-24 13:58 编辑
(Python学习第一天!本代码不完善,只能爬取 https://www.xsbiquge.com 这一个站点……而且也没有添加选项来直接进入某个模式……(懒))
下面是代码(感觉直接贴代码......不过代码有注释)
[Python] 纯文本查看 复制代码
import urllib.request
from urllib import parse
import re
# Initial values for module-level state.
# Chapter list filled in by get_list(): (href, title) tuples in page order.
# It starts as an empty list so it has the same type before and after
# get_list() has run.
list_content = []
# Features
# Search
def search(text, opener=None):
    """Search www.xsbiquge.com for *text* and print every hit.

    Each hit is parsed from its own HTML fragment, so a hit that lacks a
    field (for example the description) gets an empty value for that field
    instead of borrowing the value of a later hit.

    Args:
        text: Keyword typed by the user; it is URL-encoded before use.
        opener: Optional callable taking a URL and returning a context
            manager with a ``read()`` method that yields the page bytes.
            Defaults to ``urllib.request.urlopen``.

    Returns:
        A list with one dict per hit, in page order, with the keys
        ``title``, ``info``, ``author``, ``style``, ``date``,
        ``latest_title`` and ``latest_url``.
    """
    if opener is None:
        opener = urllib.request.urlopen

    # URL-encode the keyword so it can be embedded in the query string.
    search_url = 'https://www.xsbiquge.com/search.php?keyword=' + parse.quote(text)
    with opener(search_url) as response:
        search_html = response.read().decode('utf-8')
    search_html = search_html.replace('\n', '').replace('\r', '')

    # Everything before the first marker is page chrome; each later piece
    # starts right after one hit's opening tag and ends where the next begins.
    item_marker = '<div class="result-item result-game-item">'
    results = []
    for item_html in search_html.split(item_marker)[1:]:
        result = _parse_search_item(item_html)
        results.append(result)
        print(_format_search_item(result))
    return results


def _first_group(rule, html, default=''):
    """Return group 1 of the first match of *rule* in *html*, else *default*."""
    found = re.search(rule, html, re.S)
    return found.group(1) if found else default


def _parse_search_item(item_html):
    """Extract the fields of a single search hit from its HTML fragment."""
    tag_rule = (r'<span class="result-game-item-info-tag-title preBold">%s</span>'
                r' +<span.*?>(.+?)</span>')
    latest = re.search(
        r'<span class="result-game-item-info-tag-title preBold">最新章节:</span>'
        r' +<a.*?href="(.*?)".*?>(.+?)</a>',
        item_html,
        re.S,
    )
    return {
        'title': _first_group(r'<a cpos="title".*?<span>(.+?)</span>', item_html),
        'info': _first_group(r'<p class="result-game-item-desc">(.+?)</p>', item_html),
        'author': _first_group(tag_rule % '作者:', item_html),
        'style': _first_group(tag_rule % '类型:', item_html),
        'date': _first_group(tag_rule % '更新时间:', item_html),
        'latest_title': latest.group(2) if latest else '',
        'latest_url': latest.group(1) if latest else '',
    }


def _format_search_item(result):
    """Render one parsed hit as the multi-line text shown to the user."""
    return (
        '标题:' + result['title'] + '\n'
        + '简介:' + result['info'] + '\n'
        + '作者:' + result['author'] + '\n'
        + '类型:' + result['style'] + '\n'
        + '更新时间:' + result['date'] + '\n'
        + '最新章节:' + result['latest_title'] + '\t' + result['latest_url'] + '\n'
        + '<——————————分隔符——————————>'
    )
# Fetch the table of contents
def get_list(list_url, opener=None):
    """Download a table-of-contents page and print its numbered chapters.

    The chapter list is also stored in the module-level ``list_content`` so
    that a chapter can later be picked by its printed number.

    Args:
        list_url: URL of the table-of-contents page.
        opener: Optional callable taking a URL and returning a context
            manager with a ``read()`` method that yields the page bytes.
            Defaults to ``urllib.request.urlopen``.

    Returns:
        The list of ``(href, title)`` tuples found on the page, in page
        order (the same object that is stored in ``list_content``).
    """
    global list_content
    if opener is None:
        opener = urllib.request.urlopen

    with opener(list_url) as response:
        list_html = response.read().decode('utf-8')

    list_content = re.findall(r'<dd><a href="(.+?)".*?>(.+?)</a></dd>', list_html, re.S)

    # Print the first <dt> heading when the page has one.
    headings = re.findall(r'<dt>(.+?)</dt>', list_html, re.S)
    if headings:
        print(headings[0])

    # Numbers start at 1; the caller subtracts 1 to index list_content.
    for number, (href, chapter_title) in enumerate(list_content, start=1):
        chapter_link = 'https://www.xsbiquge.com' + href
        print('[' + str(number) + ']' + '\t' + chapter_title + '\t' + chapter_link)
    return list_content
# Fetch the text of one chapter
def get_content(content_url, opener=None):
    """Download a chapter page, print it as plain text and return that text.

    The text is the ``<h1>`` title, a newline, then the body of
    ``<div id="content">`` with line-break tags turned into newlines and
    HTML entities decoded.

    Args:
        content_url: URL of the chapter page.
        opener: Optional callable taking a URL and returning a context
            manager with a ``read()`` method that yields the page bytes.
            Defaults to ``urllib.request.urlopen``.

    Returns:
        The plain-text chapter that was printed.

    Raises:
        ValueError: If the page has no ``<h1>`` title or no
            ``<div id="content">`` body.
    """
    # Imported here because the file does not import the html module.
    from html import unescape

    if opener is None:
        opener = urllib.request.urlopen

    with opener(content_url) as response:
        page = response.read().decode('utf-8')

    title_match = re.search(r'<h1>(.*?)</h1>', page, re.S)
    body_match = re.search(r'<div id="content">(.+?)</div>', page, re.S)
    if title_match is None or body_match is None:
        raise ValueError('no chapter title/content found at ' + str(content_url))

    content = title_match.group(1) + '\n' + body_match.group(1)
    # Convert tags first so an escaped "&lt;br /&gt;" stays literal text.
    content = re.sub(r'<br\s*/?>', '\n', content)
    # Decode entities such as &lt; &gt; &amp;. unescape() turns &nbsp; into
    # U+00A0, which is then shown as a plain space.
    content = unescape(content).replace('\xa0', ' ')
    print(content)
    return content
# Sample URL (presumably a table-of-contents page for the second prompt):
# https://www.xsbiquge.com/63_63448/
print('\n本爬虫暂时只适配了一个网站,即 https://www.xsbiquge.com/')
word = input('请输入搜索内容!')
search(word)
url = input('请输入目录链接!')
get_list(url)
if not list_content:
    # Nothing to choose from, so do not prompt for a chapter number.
    print('未找到任何章节!')
else:
    while True:
        content_Num = input('请输入[]内的数字预览章节内容!')
        try:
            chapter_index = int(content_Num) - 1
        except ValueError:
            chapter_index = -1
        # Reject 0 and negatives explicitly: a negative index would silently
        # select a chapter counted from the end of the list.
        if 0 <= chapter_index < len(list_content):
            break
        print('请输入 1 到 ' + str(len(list_content)) + ' 之间的数字!')
    get_content('https://www.xsbiquge.com' + list_content[chapter_index][0])
可能写得不好,但是我会努力优化。里面要是有什么错误,或者有更简单的写法,希望回帖与我交流,咱是刚学一天Python的新手呐。