女朋友要看小说,但是找不到现成的资源,就想着用python爬一下。
源代码如下,代理是抄的论坛一个老哥的(@fengmodel)
因为急着想把小说爬下来,没加异常处理什么的,跑下来发现一点问题,就是程序跑到一半就会像假死一样不动了,也不报错,也不停止,但是我手动停止下再继续跑就又可以了,虽然两次是把所有的章节爬了下来,但是想着把问题找出来,请论坛的大佬们帮忙看看~
import random
import requests
import time
from bs4 import BeautifulSoup
def UserAgent_random():
    """Return a request-headers dict carrying one randomly picked User-Agent.

    Rotating the UA string between requests makes the crawler look less
    like a single automated client.
    """
    agents = (
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36',
        'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36',
        'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
    )
    return {'User-Agent': random.choice(agents)}
def next_page(soup):
    """Return the relative href of the '下一章' (next chapter) link, or None.

    Scans the anchors inside the page's ``div.pager``. Returns None (instead
    of raising) when the pager div is absent or no matching anchor exists,
    so the caller can detect end-of-book / layout changes and stop cleanly.
    """
    pager = soup.find(name='div', attrs={'class': 'pager'})
    if pager is None:
        # Page layout changed or request was blocked -- original code would
        # raise AttributeError here on pager.findAll.
        return None
    for a in pager.findAll(name='a'):
        if a.string == '下一章':
            return str(a['href'])
    # Explicit None: no next-chapter link on this page.
    return None
def download_page(soup):
    """Append the chapter title and body paragraphs of *soup* to the
    module-global ``paragraph`` list.

    Bug fix: the original used ``str(i.string)``, which is the literal text
    'None' whenever a <p> contains nested tags; ``get_text()`` extracts the
    full visible text instead. Also guards against a missing content div.
    """
    head = '【' + str(soup.h1.string) + '】' + '\n'  # chapter name
    paragraph.append(head)
    content_text = soup.find(name='div', attrs={'class': 'content'})
    if content_text is None:
        # Anti-crawler page or layout change: keep the title, skip the body.
        return
    for p_tag in content_text.findAll(name='p'):
        paragraph.append(p_tag.get_text() + '\n')
    paragraph.append('\n\n\n\n')  # blank gap between chapters in the output file
if __name__ == '__main__':
    url = 'https://m.gulongsw.com'        # site root
    url_r = '/xs_968/938982.html'         # first chapter (relative URL)
    # final_url = '/xs_968/1008623.html'
    # Loop until the "next chapter" link points back at the book index,
    # or until next_page() finds no link at all (url_r is None).
    while url_r and url_r != '/xs_968/':
        paragraph = []
        UserAgent = UserAgent_random()
        # BUG FIX for the reported "freezes mid-run, no error" symptom:
        # requests.get() without a timeout blocks indefinitely on a stalled
        # connection. With timeout= it raises instead of hanging forever.
        real_html = requests.get(url + url_r, headers=UserAgent, timeout=30).text
        soup = BeautifulSoup(real_html, 'html.parser')
        download_page(soup)
        url_r = next_page(soup)
        # Guard: a short/blocked page may have fewer than 5 entries, and
        # paragraph[-5] would raise IndexError.
        if len(paragraph) >= 5:
            print('loading' + paragraph[-5])
        time.sleep(5)  # throttle requests to be polite to the server
        with open('novel.txt', 'a', encoding='utf-8') as f:
            f.writelines(paragraph)
        # note: 'with' closes the file automatically; the original's
        # explicit f.close() inside the with-block was redundant.