[Python repost] A general-purpose crawler for online novel sites

Posted by ccwuax on 2021-8-20 14:50
I have read a lot of Python crawler examples and tried to learn from them, and I noticed that crawlers for most novel sites are largely the same, so I set out to write a general-purpose one. That turned out to be easier said than done: different sites handle crawlers in slightly different ways. As a compromise, the script is configuration-driven, with one configuration per site. I searched Baidu for online novel sites and have written three configurations so far; anyone who understands the code should be able to add more.
A quick note on the approach: first scrape the table-of-contents page for the chapter titles and their URLs, then fetch each chapter's content and write it to a file; elements are located with XPath. There is no multithreading and no coroutines, partly because this is meant for learning and just demonstrates the idea, and partly because I don't read novels myself and large-scale multithreaded crawling would put pressure on the servers. A minimal sketch of the two-step flow follows, with the full script after it.
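To make that idea concrete before the full script: fetch the TOC, XPath out titles and links, then fetch each chapter. The URL and every XPath expression below are invented placeholders, not a real site's values.
[Python]
import requests
from lxml import etree

# Step 1: fetch the table-of-contents page and extract chapter titles and URLs.
# toc_url and all XPath expressions here are hypothetical placeholders.
toc_url = "http://example.com/book/12345/"
toc_tree = etree.HTML(requests.get(toc_url).text)
titles = toc_tree.xpath('//ul[@class="chapters"]/li/a/text()')
urls = toc_tree.xpath('//ul[@class="chapters"]/li/a/@href')

# Step 2: fetch each chapter page and extract its body text with another XPath.
for title, url in zip(titles, urls):
    page = requests.get(url)
    page.encoding = page.apparent_encoding  # guess the page's real encoding
    text = "".join(etree.HTML(page.text).xpath('//div[@id="content"]//text()'))
    print(title, len(text))  # the real script writes this to a file instead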
Here is the full source code:
[Python]
from lxml import etree
from os import makedirs, path
import requests
from time import sleep
from urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)  # silence the unverified-HTTPS warning
proxies = None  # set to e.g. {"http": "http://127.0.0.1:10809"} to route requests through a proxy
mysession = requests.Session()  # shared session object
ser_num = 0  # running counter for progress output
allow_redirects = False
rooturl = cata_url = title_xpath = url_xpath = filename_xpath = content_xpath = ""
def isconnect_enable(_url):  # check whether the site is reachable
    try:
        requests.get(_url, proxies=proxies)
        print("Site is reachable")
        return True
    except requests.exceptions.RequestException:
        print("Site cannot be reached")
        return False
def gethtml(_url):  # fetch a page's HTML source by URL
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36",
    }
    try_count = 5  # number of retries
    while True:
        try:
            print("Fetching HTML from:", _url, end="")
            html = mysession.get(_url, headers=header, proxies=proxies, allow_redirects=allow_redirects)
            html.encoding = html.apparent_encoding  # use the encoding requests detects from the page body
            mysession.close()
            return html.text
        except requests.exceptions.RequestException:
            if try_count <= 0:
                print("\nFailed to fetch HTML, exiting!")
                exit(0)
            sleep(0.1)
            print(".", end="")
            try_count -= 1
def get_title_titleurl(_url, title_xpath, url_xpath, filename_xpath):  # scrape chapter titles and URLs from the TOC page
    '''_url: URL of the table-of-contents page
       title_xpath: XPath for the chapter titles (remember to end it with text())
       url_xpath: XPath for the chapter URLs (remember to end it with @href)
       filename_xpath: XPath for the novel title
    '''
    html = gethtml(_url)  # fetch the TOC page source
    tree = etree.HTML(html, etree.HTMLParser())
    filename = str(tree.xpath(filename_xpath)[0]).strip()
    title_dict = {}  # dict to return: chapter title -> chapter URL
    title_list = tree.xpath(title_xpath)
    url_list = tree.xpath(url_xpath)  # note: assumed absolute by default; for relative paths, set rooturl accordingly
    for i in range(len(title_list)):
        title_dict[str(title_list[i]).strip()] = url_list[i]
    return title_dict, filename  # return the dict and the novel title (used as the file name)
def get_file_content(url, content_xpath):  # fetch one chapter's text
    userAgent = r"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
    header = {'User-Agent': userAgent}
    global ser_num
    print(ser_num, "\tFetching chapter content...", end="")
    ser_num += 1
    responseRes = mysession.get(url, proxies=proxies, headers=header, verify=False)  # TLS verification off (warning silenced above)
    mysession.close()  # close the connection
    responseRes.encoding = 'utf-8'
    if responseRes.status_code == 200:
        print("Fetched successfully!", end="\t")
    tree = etree.HTML(responseRes.text)
    str_content = tree.xpath(content_xpath)
    str_content = "".join(str_content).replace("\r\n", "\n")  # drop redundant blank lines
    return str_content
def write_to_file(_dirs, _filename, _content, chapter="", _extfn=".txt", mode="w", _prefix="", _fileurl=""):  # write content to a file
    '''
    _dirs: target directory
    _prefix: file name prefix
    _filename: file name
    _content: file content
    _extfn: file extension
    mode: file write mode
    _fileurl: URL the content came from
    '''
    if chapter != "":
        print("Writing -> folder: {0}\tfile: {1}".format(_dirs, _prefix + _filename), "\t" + chapter)
    else:
        print("Writing -> folder: {0}\tfile: {1}".format(_dirs, _prefix + _filename))
    if not path.exists(_dirs):
        makedirs(_dirs)
    with open(path.join(_dirs, _prefix + _filename + _extfn), mode, encoding="utf-8") as myfile:
        if _fileurl != "":
            myfile.write("Source URL: " + str(_fileurl) + "\n")
        myfile.write(_content)
def config(site_choice, bookid):  # per-site scraping configuration
    global rooturl, cata_url, title_xpath, url_xpath, filename_xpath, content_xpath, allow_redirects
    if site_choice == "纵横":
        rooturl = r''                                                               # root to prepend when chapter URLs are relative
        cata_url = f'http://book.zongheng.com/showchapter/{bookid}.html'            # TOC page URL
        title_xpath = '/html/body/div[3]/div[2]/div[2]/div/ul/li/a/text()'          # XPath for chapter titles
        url_xpath = '/html/body/div[3]/div[2]/div[2]/div/ul/li/a/@href'             # XPath for chapter URLs
        filename_xpath = '/html/body/div[3]/div[1]/h1/text()'                       # XPath for the novel title (file name)
        content_xpath = '//*[@id="readerFt"]/div/div[5]//text()'                    # XPath for chapter body text
    if site_choice == "17k":
        rooturl = r'http://www.17k.com'                                             # root to prepend when chapter URLs are relative
        cata_url = f'http://www.17k.com/list/{bookid}.html'                         # TOC page URL
        title_xpath = '/html/body/div[5]/dl/dd/a/span/text()'                       # XPath for chapter titles
        url_xpath = '/html/body/div[5]/dl/dd/a/@href'                               # XPath for chapter URLs
        filename_xpath = '/html/body/div[5]/h1/text()'                              # XPath for the novel title (file name)
        content_xpath = '/html/body/div[4]/div[2]/div[2]/div[1]/div[2]/p//text()'   # XPath for chapter body text
    if site_choice == "起点":
        allow_redirects = True
        rooturl = r'http:'                                                          # root to prepend when chapter URLs are relative
        cata_url = f'http://book.qidian.com/info/{bookid}/#Catalog'                 # TOC page URL
        title_xpath = '//*[@id="j-catalogWrap"]/div[2]/div[1]/ul/li/a/text()'       # XPath for chapter titles
        url_xpath = '//*[@id="j-catalogWrap"]/div[2]/div[1]/ul/li/a/@href'          # XPath for chapter URLs
        filename_xpath = '/html/body/div[1]/div[6]/div[1]/div[2]/h1/em/text()'      # XPath for the novel title (file name)
        content_xpath = '//div[1]/div/div/div[2]/p/text()'                          # XPath for chapter body text
if __name__ == "__main__":
    site_list = ["纵横", "17k", "起点"]
    for i in range(len(site_list)):
        print("\t", i, "\t", site_list[i])
    site = input("Choose a site (enter its number): ")
    site = site_list[int(site)]
    bookid = input("Enter the book ID: ")
    config(site, str(bookid))
    path_dir = r".\mytemp"                                                          # directory where the novel files are saved
    title_url_dict, filename = get_title_titleurl(cata_url, title_xpath, url_xpath, filename_xpath)
    print("\tNovel/article title: 《" + filename + "》")
    for title in title_url_dict:
        target_url = rooturl + title_url_dict[title]  # chapter page URL
        content = get_file_content(target_url, content_xpath)
        write_to_file(path_dir, filename, "*" * 100 + "\n" + title + "\n" + content + "\n", chapter=title, mode="a", _fileurl=target_url)
        sleep(1)  # pause between chapters to go easy on the server
    print("Done!")
