Scraping pirated e-books
I previously wrote a script to scrape pirated e-books, but it is too slow: it can only fetch one book at a time. How should I optimize it so it can scrape several books at once? Here is my script:
```
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# time: 2023/7/25 15:22
# file: xbiquge.py
# author: lyx
import re

import requests
from bs4 import BeautifulSoup
from time import sleep


def main():
    namelist = ['阵问长生', '神明模拟器', '半岛检查官', '全民逃荒,我的物品能升级', '玄鉴仙族']
    for bookname in namelist:
        print(f"Scraping book: {bookname}")
        # Search the site and pick the result link whose text matches the title exactly.
        search_url = 'https://www.ibiquges.info/modules/article/waps.php'
        res = requests.get(url=search_url, params={'searchkey': bookname})
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        link = soup.find('a', string=bookname)
        if not link:
            print("No matching URL found")
            continue
        base_url = 'https://www.ibiquges.info'
        index = requests.get(url=link['href'])
        index.encoding = 'utf-8'
        catalog = BeautifulSoup(index.text, 'html.parser')
        chapter_list = str(catalog.find('div', id='list'))
        # findall with two capture groups already yields (href, title) tuples;
        # the original wrapped each tuple again, which broke the URL join below.
        matches = re.findall(r'<dd><a href="(.*?)">(.*?)</a></dd>', chapter_list)
        with open(bookname + '.txt', 'a', encoding='utf-8') as file:
            for href, title in matches:
                # Retry each chapter up to 20 times: the site sometimes returns
                # a page without the content div.
                for _ in range(20):
                    chapter = requests.get(base_url + href)
                    chapter.encoding = 'utf-8'
                    page = BeautifulSoup(chapter.text, 'html.parser')
                    chapter_div = page.find('div', {'id': 'content'})
                    if chapter_div:
                        # Strip ad paragraphs and nested divs, keep the bare text.
                        for p in chapter_div.find_all('p'):
                            p.decompose()
                        for d in chapter_div.find_all('div'):
                            d.decompose()
                        middle_text = chapter_div.get_text("\n", strip=True)
                        print('\n\n\n' + title + '\n\n\n')
                        file.write('\n\n\n' + title + '\n\n\n')
                        file.write(middle_text)
                        break
                    sleep(1)  # back off briefly before retrying


if __name__ == "__main__":
    main()
```

It just needs to scrape each book in full, not stop after the first chapter.
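Side note on the completeness concern: transport-level failures can be retried by requests itself instead of the hand-rolled 20-try loop. A minimal sketch using urllib3's Retry — the counts and backoff are illustrative, and this only covers connection errors and the listed status codes, so the application-level check for a missing content div is still needed:

```
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# Up to 5 retries with exponential backoff on transient server errors.
retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retries))

res = session.get('https://www.ibiquges.info/modules/article/waps.php',
                  params={'searchkey': '阵问长生'})
```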
Reply from rwj1990 (2023-8-8 15:54) — use multithreading (concurrent.futures.ThreadPoolExecutor) so each book is crawled in its own worker thread:

```
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# time: 2023/7/25 15:22
# file: xbiquge.py
# author: lyx
import re

import requests
from bs4 import BeautifulSoup
from time import sleep
from concurrent.futures import ThreadPoolExecutor


def crawl_book(bookname):
    print(f"Scraping book: {bookname}")
    # Search the site and pick the result link whose text matches the title exactly.
    search_url = 'https://www.ibiquges.info/modules/article/waps.php'
    res = requests.get(url=search_url, params={'searchkey': bookname})
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    link = soup.find('a', string=bookname)
    if not link:
        print("No matching URL found")
        return
    base_url = 'https://www.ibiquges.info'
    index = requests.get(url=link['href'])
    index.encoding = 'utf-8'
    catalog = BeautifulSoup(index.text, 'html.parser')
    chapter_list = str(catalog.find('div', id='list'))
    # findall with two capture groups yields (href, title) tuples directly.
    matches = re.findall(r'<dd><a href="(.*?)">(.*?)</a></dd>', chapter_list)
    with open(bookname + '.txt', 'a', encoding='utf-8') as file:
        for href, title in matches:
            # Retry each chapter up to 20 times in case the content div is missing.
            for _ in range(20):
                chapter = requests.get(base_url + href)
                chapter.encoding = 'utf-8'
                page = BeautifulSoup(chapter.text, 'html.parser')
                chapter_div = page.find('div', {'id': 'content'})
                if chapter_div:
                    # Strip ad paragraphs and nested divs before extracting text.
                    for p in chapter_div.find_all('p'):
                        p.decompose()
                    for d in chapter_div.find_all('div'):
                        d.decompose()
                    middle_text = chapter_div.get_text("\n", strip=True)
                    print('\n\n\n' + title + '\n\n\n')
                    file.write('\n\n\n' + title + '\n\n\n')
                    file.write(middle_text)
                    break
                sleep(1)  # back off briefly before retrying


def main():
    namelist = ['阵问长生', '神明模拟器', '半岛检查官', '全民逃荒,我的物品能升级', '玄鉴仙族']
    # One worker thread per book; each thread writes to its own output file.
    with ThreadPoolExecutor(max_workers=len(namelist)) as executor:
        executor.map(crawl_book, namelist)


if __name__ == "__main__":
    main()
```
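One caveat with executor.map: it only raises a worker's exception when the corresponding result is iterated, so a book that fails mid-crawl can go unnoticed. A small sketch (reusing crawl_book from above) that reports per-book failures explicitly:

```
from concurrent.futures import ThreadPoolExecutor, as_completed


def crawl_all(namelist):
    # submit() returns one future per book; as_completed() yields them
    # as they finish, and future.result() re-raises any worker exception.
    with ThreadPoolExecutor(max_workers=len(namelist)) as executor:
        futures = {executor.submit(crawl_book, name): name for name in namelist}
        for future in as_completed(futures):
            name = futures[future]
            try:
                future.result()
                print(f"Finished: {name}")
            except Exception as exc:
                print(f"Failed: {name}: {exc}")
```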
Quoting rwj1990's reply — thanks, I'll give it a try.

Another reply: switch to coroutines, and swap BeautifulSoup for xpath and regex. Search the forum and copy any existing example; the overall logic is basically the same. You don't need to crawl multiple books at once, let alone reach for multithreading: making a single book crawl fast enough is plenty, since an e-book is tiny anyway.
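A minimal sketch of that coroutine suggestion, assuming aiohttp and lxml — the xpath expressions mirror the div ids used in the scripts above, and the index URL is a placeholder, not a real link from the thread:

```
import asyncio

import aiohttp
from lxml import etree

BASE = 'https://www.ibiquges.info'


async def fetch(session, url):
    # Single GET with a modest timeout; the site serves UTF-8.
    async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as resp:
        return await resp.text(encoding='utf-8')


async def crawl_book(session, index_url, outfile):
    html = etree.HTML(await fetch(session, index_url))
    # Chapter links sit under <div id="list"> as <dd><a href>title</a></dd>.
    hrefs = html.xpath('//div[@id="list"]//dd/a/@href')
    titles = html.xpath('//div[@id="list"]//dd/a/text()')
    sem = asyncio.Semaphore(10)  # cap in-flight requests so the site isn't hammered

    async def fetch_chapter(href):
        async with sem:
            page = etree.HTML(await fetch(session, BASE + href))
            # Chapter body: direct text nodes of <div id="content">.
            parts = page.xpath('//div[@id="content"]/text()')
            return '\n'.join(t.strip() for t in parts if t.strip())

    # gather() preserves input order, so chapters stay aligned with titles.
    chapters = await asyncio.gather(*(fetch_chapter(h) for h in hrefs))
    with open(outfile, 'w', encoding='utf-8') as f:
        for title, body in zip(titles, chapters):
            f.write('\n\n\n' + title + '\n\n\n' + body)


async def main():
    async with aiohttp.ClientSession() as session:
        # Placeholder index URL; substitute a real chapter-list page.
        await crawl_book(session, BASE + '/0/123/', 'book.txt')


asyncio.run(main())
```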