本帖最后由 w411024 于 2020-9-20 14:06 编辑
上次福利过后,我认识到了很大的错误,由于散播色情图片我深感抱歉,我是来分享学习的不是来教坏小朋友的
所以这次你们就别想了 我把网址删除掉了 只让你们看代码 和一个成品 需要的自己下载吧
本次是在网上看到一些小说就感觉可以爬取下来 至于什么小说你们自己脑补 总共8个分类 一个分类750条数据
emmm 先发一个小说 下午发一个图片的 仅仅只是为了学习知识 请勿用于其他用途
本软件仅提供学习用途,请勿商用以及传播,请在下载24小时内删除
教程:打开程序后,输入要保存到哪个位置的路径就可以了
成品:https://www.lanzoux.com/i8rAggsafdc
# UTF-8
# author mimang
import requests
from bs4 import BeautifulSoup
import os
def getHtml(url, timeout=30):
    """Download *url* and return it parsed as a BeautifulSoup document.

    The request is sent with the module-level ``headers`` dict. The response
    body is always decoded as UTF-8, overriding whatever encoding the server
    declared.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server raises instead of blocking the
            crawl forever.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``'lxml'`` parser.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Force UTF-8 so that response.text is not decoded with a guessed charset.
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'lxml')
def sub(strings, p, c):
    """Return *strings* with the item at index *p* replaced by ``str(c)``.

    Args:
        strings: The text to edit (any iterable of strings works).
        p: Index of the single item to replace; negative indices count from
            the end, as with normal list indexing.
        c: Replacement value; it is converted with ``str`` and may be longer
            than one character, in which case the result grows.

    Returns:
        The edited text as a new string.

    Raises:
        IndexError: If *p* is outside the bounds of *strings*.
    """
    chars = list(strings)
    chars[p] = str(c)
    return ''.join(chars)
# HTTP request headers sent with every page download.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    # 'br' is intentionally not advertised: Brotli responses are only decoded
    # when an optional brotli package is installed, and an undecoded body
    # would be parsed as garbage.
    'Accept-Encoding': 'gzip, deflate',
    # NOTE(review): looks like analytics cookies captured from a browser
    # session; confirm whether the site actually requires them.
    'Cookie': 'UM_distinctid=174a9141007f29-0697ac1aa46086-3971095d-19fd10-174a9141008cbb; CNZZDATA1279234273=1349150808-1600561283-%7C1600561283',
}
# Site root. The author removed the real address on purpose; fill it in
# before running.
url = 'xxxxxxxx'
# Number of listing pages requested for every category.
bookCount = 38

# Characters that are illegal (or troublesome) in file names on Windows/POSIX.
_UNSAFE_NAME_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|\r\n\t'})


def _safe_name(text):
    """Return *text* trimmed and with path-hostile characters replaced by '_'."""
    return text.strip().translate(_UNSAFE_NAME_CHARS)


def main():
    """Crawl every category of the site and save each article as a text file.

    Prompts for an output directory, creates it if needed, reads the category
    links from the site's index page, then walks ``bookCount`` listing pages
    per category. Each article's text is appended, UTF-8 encoded, to
    ``<output>/<category>/<title>.txt``.
    """
    path = input('输入:')
    if not os.path.exists(path):
        os.makedirs(path)
        print('路径已创建')

    html = getHtml(url + '/home/index.html')
    allBookUrl = html.select('#menu > ul.color > li > a')
    print(len(allBookUrl))  # number of category links found

    # The first menu entry is skipped, exactly as the original loop did;
    # presumably it is the home link rather than a category (TODO confirm).
    for category in allBookUrl[1:]:
        # Build explicit paths instead of chdir-ing into each category, so
        # categories become siblings instead of nesting inside one another.
        categoryDir = os.path.join(path, _safe_name(category.get_text()))
        os.makedirs(categoryDir, exist_ok=True)
        href = category.get('href')

        for page in range(1, bookCount + 1):
            # Character 21 of the category href is swapped for the page
            # number; this relies on the site's URL layout.
            pagePath = sub(href, 21, page)
            print(pagePath)
            listing = getHtml(url + pagePath)
            bookList = listing.select('#hellobox > div.newslist.textlist > ul > li > a')

            for link in bookList:
                bookTitle = link.get_text()
                try:
                    bookPage = getHtml(url + link.get('href'))
                except requests.RequestException as err:
                    # One unreachable article must not abort the whole crawl.
                    print(f'{bookTitle}:失败 ({err})')
                    continue

                bodies = bookPage.select('#hellobox > div.newsbody > div.nbodys')
                if not bodies:
                    # Page without the expected content container: skip it.
                    print(bookTitle + ':失败')
                    continue

                target = os.path.join(categoryDir, _safe_name(bookTitle) + '.txt')
                # Append mode is kept from the original; the context manager
                # guarantees the file is closed even if the write fails.
                with open(target, 'ab') as f:
                    f.write(bodies[0].get_text().encode('utf8'))
                print(bookTitle + ':成功')


if __name__ == '__main__':
    main()