[Python beginner crawler] Scraping novels from 笔趣阁
A beginner's Python practice crawler script for scraping novels from 笔趣阁: it can search for a book, fetch the novel's attributes, and read chapter content.
If you plan to write a novel-reading app later, you can use it as a reference.
import time
from bs4 import BeautifulSoup
import requests
import urllib.parse
#Simulate the novel search
def search(url):
    print("Visiting: " + url)
    #request the page
    response = requests.get(url)
    #get the real address (a unique match redirects straight to the book page)
    url = response.url
    #avoid garbled Chinese: the charset comes from the page's meta tag
    response.encoding = 'gbk'
    #get the html content
    html_content = response.text
    #parse with BeautifulSoup
    soup = BeautifulSoup(html_content, "lxml")
    searchList = []
    #collect the search results
    i = 0
    if len(soup.select('#main li')) <= 0:
        if soup.title.string == '笔趣阁':
            #still on the plain search page: no results at all
            return []
        else:
            #exactly one match: we were redirected to the book page itself
            name = soup.select_one('#info > h1').string
            author = soup.select_one('#info > p:nth-child(2) > a').string
            novel = {"name": name, "url": url, "author": author}
            print("id:%d\tTitle:%s\tAuthor:%s" % (i, name, author))
            searchList.append(novel)
            return searchList
    else:
        for child in soup.select('#main li'):
            name = child.select_one('.s2 a').string
            url = child.select_one('.s2 a').get('href')
            author = child.select_one('.s4').string
            novel = {"name": name, "url": url, "author": author}
            searchList.append(novel)
            print("id:%d\tTitle:%s\tAuthor:%s" % (i, name, author))
            i += 1
        return searchList
#Scrape the novel's attributes
def getNovelAtrr(url):
    print("Visiting: " + url)
    #request the page
    response = requests.get(url)
    #avoid garbled Chinese: the charset comes from the page's meta tag
    response.encoding = 'gbk'
    #get the html content
    html_content = response.text
    #parse with BeautifulSoup
    soup = BeautifulSoup(html_content, "lxml")
    #grab title, author, intro, update time, word count and chapter list
    name = soup.select_one('#info h1').string
    author = soup.select_one('#info > p:nth-child(2) > a').string
    profile = soup.select_one('#intro').text
    #the fourth <p> mixes the update time and the word count; split it on "[" and "字"
    updata_wordnum = str(soup.select_one('#info > p:nth-child(4)').text)
    index = updata_wordnum.find("[")
    lastindex = updata_wordnum.find("字")
    updataTime = updata_wordnum[:index] if index != -1 else updata_wordnum
    wordnum = updata_wordnum[index + 1:lastindex + 1] if index != -1 else updata_wordnum
    catalogList = []
    for item in soup.select('#list > dl > dd a'):
        value = url + item.get("href")
        chapterName = item.text
        catalogList.append({chapterName: value})
    Novel = {"name": name, "url": url, "profile": profile, "author": author,
             "updataTime": updataTime, "wordnum": wordnum, "catalogList": catalogList}
    return Novel
#Open a chapter and print its text
def openCatalog(url):
    print("Visiting: " + url)
    #request the page
    response = requests.get(url)
    #avoid garbled Chinese: the charset comes from the page's meta tag
    response.encoding = 'gbk'
    #get the html content
    html_content = response.text
    #parse with BeautifulSoup
    soup = BeautifulSoup(html_content, "lxml")
    content = soup.select_one('#content').text
    print(content)
#Pick a book from the search results and browse its chapters
def openNovel(id, searchList):
    if id >= len(searchList):
        print("No such book")
    else:
        Novel = getNovelAtrr(searchList[id]["url"])
        page = 0
        limit = 10
        while True:
            #print the current page of the chapter list
            for i in range(limit):
                pos = page * limit + i
                if pos >= len(Novel["catalogList"]):
                    break
                chapterName = list(Novel["catalogList"][pos].keys())[0]
                print("id:%d\t %s" % (pos, chapterName))
            cmdid = int(input("Enter -1 for previous page, -2 for next page, or a chapter id to open it: "))
            if cmdid == -1:
                if page > 0:
                    page = page - 1
            elif cmdid == -2:
                if (page + 1) * limit < len(Novel["catalogList"]):
                    page = page + 1
            elif cmdid >= 0:
                chapter = Novel["catalogList"][cmdid]
                for key in chapter:
                    openCatalog(chapter[key])
                break
if __name__ == '__main__':
    searchUrl = 'https://www.52bqg.com/modules/article/search.php?searchkey='
    word = str(input("Enter a search keyword: "))
    #build the search url: the keyword must be url-encoded as gbk to match the site
    url = searchUrl + urllib.parse.quote(word.encode('gbk'))
    searchList = search(url)
    if len(searchList) == 0:
        print("Search failed!")
    else:
        openNovel(int(input("Enter a novel id: ")), searchList)
    #getNovelAtrr("https://www.52bqg.com/book_361/")  #leftover test call, disabled
Update: fixed the bug where a search that returned exactly one result (the site redirects straight to the book page) displayed nothing, and added paging for the chapter list.

ghoob321 posted on 2020-7-23 08:34:
chapter_urls= html.xpath('//*[@id="chapter"]/div/div/ul/div/li/a/@href')
chapter_urls= html.xpath('//*[@id="chapter"]/div/div/ul/div/div/li/a/@href')
chapter_urls= html.xpath('//*[@id="chapter"]/div/div/ul/div/div/div/li/a/@href')
chapter_urls= html.xpath('//*[@id="chapter"]/div/div/ul/div/div/div/div/div/li/a/@href')
chapter_urls= html.xpath('//*[@id="chapter"]/div/div/ul/div/div/div/div/div/div/li/a/@href')
chapter_urls= html.xpath('//*[@id="chapter"]/div/div/ul/div/div/div/div/div/div/div/li/a/@href')
How can this block be optimized or simplified?
Reply: press F12 in the browser, select the node you want to scrape, then right-click Copy -> Copy selector; that gives you a much cleaner selector to start from.
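On ghoob321's question above, one possible simplification, assuming every chapter link really is an <a> inside some <li> nested somewhere below the #chapter node (which is what the six hand-written paths suggest), is to let the XPath descendant axis absorb the varying <div> depth. A small sketch with lxml, using an invented page fragment just to exercise the query:

#Sketch only: collapses the six XPaths into one by using the descendant axis "//"
from lxml import etree

sample = """
<div id="chapter">
  <ul>
    <div><li><a href="/book_361/1.html">Chapter 1</a></li></div>
    <div><div><li><a href="/book_361/2.html">Chapter 2</a></li></div></div>
  </ul>
</div>
"""

html = etree.HTML(sample)
#"//" matches any depth, so one query covers every nesting level at once
chapter_urls = html.xpath('//*[@id="chapter"]//li/a/@href')
print(chapter_urls)  #one flat list of hrefs, whatever the <div> depth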