爬取盗版电子书

iunklinkm 发表于 2023-8-8 15:17

本帖最后由 iunklinkm 于 2023-8-8 15:24 编辑

之前写了一个脚本爬取盗版电子书，但是效率太低，只能一本一本地爬。想请教一下各位大神，该怎么优化才能同时爬取多本？以下是我的脚本：

```
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# time: 2023/7/25 15:22
# file: xbiquge.py
# author: lyx

import requests, re, random, sys,io
from bs4 import BeautifulSoup
from time import sleep

def main():
    """Download every book in the hard-coded title list into ``<title>.txt``.

    Books are processed one after another.  For each title the site's search
    page is queried, the link whose text equals the title is followed to the
    catalogue page, and every chapter listed there is fetched and appended to
    the output file in catalogue order.
    """
    namelist = ['阵问长生','神明模拟器','半岛检查官','全民逃荒，我的物品能升级','玄鉴仙族']
    site_root = 'https://www.ibiquges.info'
    search_url = 'https://www.ibiquges.info/modules/article/waps.php'
    # One catalogue entry: group 1 is the chapter href, group 2 its title.
    chapter_pattern = r'<dd><a href="(.*?)">(.*?)</a></dd>'
    max_attempts = 20

    for bookname in namelist:
        print(f"爬取书籍： {bookname}!")
        res = requests.get(url=search_url, params={'searchkey': bookname})
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        book_link = soup.find('a', string=bookname)
        if book_link is None:
            print("未找到匹配的URL")
            continue

        catalog_page = requests.get(url=book_link['href'])
        catalog_page.encoding = 'utf-8'
        catalog = BeautifulSoup(catalog_page.text, 'html.parser').find('div', id='list')
        # re.findall with two groups yields (href, title) tuples.
        chapters = re.findall(chapter_pattern, str(catalog))

        with open(bookname + '.txt', 'a', encoding='utf-8') as file:
            for chapter_path, chapter_title in chapters:
                chapter_url = site_root + chapter_path
                # Retry while the response lacks the content container.
                for _ in range(max_attempts):
                    chapter = requests.get(chapter_url)
                    chapter.encoding = 'utf-8'
                    page = BeautifulSoup(chapter.text, 'html.parser')
                    chapter_div = page.find('div', {'id': 'content'})
                    if chapter_div is None:
                        continue

                    # Drop nested <p> and <div> elements so only the text
                    # directly inside the container is kept.  Two separate
                    # passes: the second search runs after the first removal.
                    for p in chapter_div.find_all('p'):
                        p.decompose()
                    for d in chapter_div.find_all('div'):
                        d.decompose()

                    heading = '\n\n\n' + chapter_title + '\n\n\n'
                    print(heading)
                    file.write(heading)
                    file.write(chapter_div.get_text("\n", strip=True))
                    break
                else:
                    # Every attempt failed: report it instead of silently
                    # leaving a gap in the book.
                    print(f"章节获取失败： {chapter_title}")

# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

```

momo2436 发表于 2023-8-8 16:29

能爬完整就好，不要爬一章就开头。

rwj1990 发表于 2023-8-8 15:54

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# time: 2023/7/25 15:22
# file: xbiquge.py
# author: lyx

import requests
import re
import random
import sys
import io
from bs4 import BeautifulSoup
from time import sleep
from concurrent.futures import ThreadPoolExecutor

def crawl_book(bookname):
    """Download one book from the site and append it to ``<bookname>.txt``.

    The site's search page is queried for ``bookname``; the link whose text
    equals the title is followed to the catalogue page, and every chapter
    listed there is fetched and written to the file in catalogue order.

    Args:
        bookname: Exact book title, used both as the search key and as the
            output file's base name.
    """
    site_root = 'https://www.ibiquges.info'
    search_url = 'https://www.ibiquges.info/modules/article/waps.php'
    # One catalogue entry: group 1 is the chapter href, group 2 its title.
    chapter_pattern = r'<dd><a href="(.*?)">(.*?)</a></dd>'
    max_attempts = 20

    print(f"爬取书籍： {bookname}!")
    res = requests.get(url=search_url, params={'searchkey': bookname})
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    book_link = soup.find('a', string=bookname)
    if book_link is None:
        print("未找到匹配的URL")
        return

    catalog_page = requests.get(url=book_link['href'])
    catalog_page.encoding = 'utf-8'
    catalog = BeautifulSoup(catalog_page.text, 'html.parser').find('div', id='list')
    # re.findall with two groups yields (href, title) tuples.
    chapters = re.findall(chapter_pattern, str(catalog))

    with open(bookname + '.txt', 'a', encoding='utf-8') as file:
        for chapter_path, chapter_title in chapters:
            chapter_url = site_root + chapter_path
            # Retry while the response lacks the content container.
            for _ in range(max_attempts):
                chapter = requests.get(chapter_url)
                chapter.encoding = 'utf-8'
                page = BeautifulSoup(chapter.text, 'html.parser')
                chapter_div = page.find('div', {'id': 'content'})
                if chapter_div is None:
                    continue

                # Drop nested <p> and <div> elements so only the text directly
                # inside the container is kept.  Two separate passes: the
                # second search runs after the first removal.
                for p in chapter_div.find_all('p'):
                    p.decompose()
                for d in chapter_div.find_all('div'):
                    d.decompose()

                heading = '\n\n\n' + chapter_title + '\n\n\n'
                print(heading)
                file.write(heading)
                file.write(chapter_div.get_text("\n", strip=True))
                break
            else:
                # Every attempt failed: report it instead of silently leaving
                # a gap in the book.
                print(f"章节获取失败： {chapter_title}")

def main():
    """Crawl every book in the hard-coded title list, one worker thread per book."""
    namelist = ['阵问长生', '神明模拟器', '半岛检查官', '全民逃荒，我的物品能升级', '玄鉴仙族']
    with ThreadPoolExecutor(max_workers=len(namelist)) as executor:
        jobs = [(bookname, executor.submit(crawl_book, bookname)) for bookname in namelist]
        for bookname, job in jobs:
            # An exception raised in a worker only surfaces when its result is
            # requested; without this loop a failed crawl would go unnoticed.
            try:
                job.result()
            except Exception as exc:  # top-level boundary: report, keep other books going
                print(f"爬取失败： {bookname}: {exc!r}")

# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

rwj1990 发表于 2023-8-8 15:53

多线程（concurrent.futures.ThreadPoolExecutor）

iunklinkm 发表于 2023-8-8 15:59

rwj1990 发表于 2023-8-8 15:54
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# time: 2023/7/25 15:22

感谢，我试试

zhuxiangyu1024 发表于 2023-8-8 16:08

上协程，把 BeautifulSoup 换成 xpath 和正则。论坛搜一下，随便抄个代码，大体逻辑基本都差不多。不用考虑多本一起爬，更不需要考虑多线程这种东西，一本爬得足够快就行了，电子书才多大。

Shadow1005 发表于 2023-8-8 16:37

謝謝分享

GaryZong 发表于 2023-8-8 16:37

謝謝分享

orb001 发表于 2023-8-8 16:58

谢谢分享

njcdh 发表于 2023-8-8 17:19

谢谢分享

页: [1] 2 3 4 5

吾爱破解 - 52pojie.cn's Archiver

爬取盗版电子书