本帖最后由 xuyanhenry 于 2024-9-16 19:16 编辑
新人第一次发帖,如有不对欢迎指出,大佬勿喷
由于番茄小说本来就是免费的所以拿来练手
需要输入番茄网页端对应网址后面一串的id。例:https://fanqienovel.com/page/6982529841564224526后面的6982529841564224526来进行下载。
访问过于频繁时,可能会在短时间内出现访问报错
引用:
1.由于番茄对于网页版的限制导致后面大部分章节无法全部观看因此参考油猴插件https://greasyfork.org/zh-CN/scripts/486817-%E7%95%AA%E8%8C%84%E5%B0%8F%E8%AF%B4%E7%BD%91%E9%A1%B5%E7%89%88-%E5%85%8D%E8%B4%B9%E9%98%85%E8%AF%BB-%E6%94%AF%E6%8C%81%E8%A7%A3%E9%94%81vip
并对代码进行大部分的引用
2.由于番茄字体有加密,参考https://blog.csdn.net/SM_zeng/article/details/140564469
此文章对字体进行解析,没有仔细检查可能有部分出错
源码:<blockquote>import requests
import json
import re
from parsel import Selector
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from lxml import etree
from tqdm import tqdm
import time
# Lookup table for the site's obfuscated font: each key is the decimal code
# point (as a string) of a substituted glyph and each value is the character
# written in its place. fetch_content() looks up str(ord(ch)) here and keeps
# any character without an entry unchanged.
# NOTE(review): some code points in the range have no entry (e.g. 58358,
# 58366) and the accompanying post says the table was not carefully checked,
# so individual mappings may be wrong.
dic_data = {
"58344": "d", "58345": "在", "58346": "干", "58347": "特", "58348": "家", "58349": "军", "58350": "然",
"58351": "表", "58352": "场", "58353": "4", "58354": "要", "58355": "只", "58356": "v", "58357": "和",
"58359": "6", "58360": "别", "58361": "还", "58362": "g", "58363": "现", "58364": "儿", "58365": "岁",
"58368": "此", "58369": "象", "58370": "月", "58371": "3", "58372": "出", "58373": "战", "58374": "工",
"58375": "相", "58376": "。", "58377": "男", "58378": "直", "58379": "失", "58380": "世", "58381": "F",
"58382": "都", "58383": "平", "58384": "文", "58385": "什", "58386": "V", "58387": "o", "58388": "将",
"58389": "真", "58390": "工", "58391": "那", "58392": "当", "58394": "会", "58395": "立", "58396": "此",
"58397": "山", "58398": "是", "58399": "十", "58400": "张", "58401": "学", "58402": "气", "58403": "大",
"58404": "爱", "58405": "两", "58406": "命", "58407": "全", "58408": "后", "58409": "东", "58410": "性",
"58411": "通", "58412": "被", "58413": "1", "58414": "它", "58415": "乐", "58416": "接", "58417": "而",
"58418": "感", "58419": "车", "58420": "山", "58421": "公", "58422": "了", "58423": "常", "58424": "以",
"58425": "何", "58426": "可", "58427": "话", "58428": "先", "58429": "p", "58430": "j", "58431": "叫",
"58432": "轻", "58433": "m", "58434": "十", "58435": "以", "58436": "着", "58437": "变", "58438": "尔",
"58439": "快", "58440": "上", "58441": "个", "58442": "说", "58443": "小", "58444": "色", "58445": "里",
"58446": "安", "58447": "花", "58448": "远", "58449": "7", "58450": "难", "58451": "师", "58452": "放",
"58453": "十", "58454": "报", "58455": "认", "58456": "亩", "58457": "道", "58458": "s", "58460": "克",
"58461": "地", "58462": "度", "58463": "上", "58464": "好", "58465": "机", "58466": "u", "58467": "民",
"58468": "写", "58469": "把", "58470": "万", "58471": "同", "58472": "水", "58473": "新", "58474": "没",
"58475": "书", "58476": "申", "58477": "吃", "58478": "像", "58479": "斯", "58480": "5", "58481": "为",
"58482": "v", "58483": "白", "58484": "几", "58485": "日", "58486": "教", "58487": "看", "58488": "但",
"58489": "第", "58490": "加", "58491": "候", "58492": "作", "58493": "上", "58494": "拉", "58495": "住",
"58496": "有", "58497": "法", "58498": "r", "58499": "事", "58500": "应", "58501": "位", "58502": "利",
"58503": "你", "58504": "声", "58505": "身", "58506": "国", "58507": "问", "58508": "马", "58509": "女",
"58510": "他", "58511": "y", "58512": "比", "58513": "父", "58514": "", "58515": "a", "58516": "h",
"58517": "n", "58518": "c", "58519": "x", "58520": "边", "58521": "美", "58522": "对", "58523": "所",
"58524": "金", "58525": "活", "58526": "回", "58527": "意", "58528": "到", "58529": "之", "58530": "从",
"58531": "j", "58532": "知", "58533": "又", "58534": "内", "58535": "冈", "58536": "点", "58537": "o",
"58538": "一", "58539": "定", "58540": "8", "58541": "r", "58542": "b", "58543": "正", "58544": "或",
"58545": "夫", "58546": "向", "58547": "德", "58548": "听", "58549": "更", "58551": "得", "58552": "告",
"58553": "并", "58554": "本", "58555": "q", "58556": "过", "58557": "记", "58558": "上", "58559": "让",
"58560": "打", "58561": "f", "58562": "人", "58563": "就", "58564": "者", "58565": "去", "58566": "原",
"58567": "满", "58568": "体", "58569": "做", "58570": "经", "58571": "k", "58572": "走", "58573": "如",
"58574": "孩", "58575": "c", "58576": "g", "58577": "给", "58578": "使", "58579": "物", "58581": "最",
"58582": "笑", "58583": "部", "58585": "员", "58586": "等", "58587": "受", "58588": "k", "58589": "行",
"58590": "", "58591": "条", "58592": "果", "58593": "动", "58594": "光", "58595": "门", "58596": "头",
"58597": "见", "58598": "往", "58599": "白", "58600": "解", "58601": "成", "58602": "外", "58603": "天",
"58604": "能", "58605": "干", "58606": "名", "58607": "其", "58608": "发", "58609": "总", "58610": "母",
"58611": "的", "58612": "死", "58613": "手", "58614": "入", "58615": "路", "58616": "进", "58617": "心",
"58618": "来", "58619": "h", "58620": "时", "58621": "力", "58622": "多", "58623": "开", "58624": "已",
"58625": "许", "58626": "d", "58627": "至", "58628": "由", "58629": "很", "58630": "界", "58631": "n",
"58632": "小", "58633": "与", "58634": "之", "58635": "想", "58636": "代", "58637": "么", "58638": "分",
"58639": "牛", "58640": "口", "58641": "再", "58642": "妈", "58643": "望", "58644": "次", "58645": "西",
"58646": "风", "58647": "种", "58648": "带", "58649": "J", "58651": "实", "58652": "情", "58653": "才",
"58654": "这", "58656": "F", "58657": "我", "58658": "神", "58659": "格", "58660": "长", "58661": "觉",
"58662": "间", "58663": "年", "58664": "眼", "58665": "无", "58666": "不", "58667": "亲", "58668": "关",
"58669": "结", "58670": "0", "58671": "友", "58672": "信", "58673": "下", "58674": "却", "58675": "重",
"58676": "已", "58677": "老", "58678": "2", "58679": "音", "58680": "字", "58681": "m", "58682": "呢",
"58683": "明", "58684": "之", "58685": "前", "58686": "高", "58687": "p", "58688": "b", "58689": "目",
"58690": "太", "58691": "。", "58692": "9", "58693": "起", "58694": "穆", "58695": "她", "58696": "也",
"58697": "w", "58698": "用", "58699": "方", "58700": "子", "58701": "英", "58702": "每", "58703": "理",
"58704": "便", "58705": "四", "58706": "数", "58707": "期", "58708": "中", "58709": "c", "58710": "外",
"58711": "样", "58712": "a", "58713": "海", "58714": "们", "58715": "任"
}
def get_text_from_xpath(book_id, css):
    """Fetch a book's page and return the first match of a CSS selector.

    Despite the function name, ``css`` is a CSS selector (it is passed to
    ``Selector.css``), not an XPath expression.

    Args:
        book_id: Book identifier appended to ``https://fanqienovel.com/page/``.
        css: CSS selector whose first match is returned.

    Returns:
        The first matched value, or ``None`` when the request fails, the
        response status is not 200, or the selector matches nothing (or only
        an empty string).
    """
    url = f"https://fanqienovel.com/page/{book_id}"
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {url}: {e}")
        return None
    if response.status_code != 200:
        return None
    # ``or None`` keeps the original mapping of an empty match to None.
    return Selector(response.text).css(css).get() or None
def get_content_from_fanqienovel(item_id, retries=5):
    """Fetch one chapter's raw content from the fanqienovel reader API.

    Args:
        item_id: Chapter identifier sent as the ``itemId`` query parameter.
        retries: Maximum number of request attempts.

    Returns:
        The ``data.chapterData.content`` string of the JSON response, or the
        sentinel string "无法解析" when it cannot be obtained.
    """
    url = f"https://fanqienovel.com/api/reader/full?itemId={item_id}"
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json, text/plain, */*",
        "Cookie": "novel_web_id=7357767624615331362;"
    }
    for attempt in range(retries):
        try:
            # Without a timeout a stalled connection would hang a worker thread.
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                payload = response.json()
                # Any level may be missing or null; fall back to empty dicts
                # instead of raising AttributeError on None.
                chapter_data = (payload.get('data') or {}).get('chapterData') or {}
                content = chapter_data.get('content')
                return content if isinstance(content, str) else '无法解析'
            print(f"Attempt {attempt + 1} failed: HTTP {response.status_code}")
        except (requests.exceptions.RequestException, ValueError, AttributeError) as e:
            # ValueError covers a body that is not valid JSON; AttributeError
            # covers a JSON payload whose shape is not the expected dict.
            print(f"Attempt {attempt + 1} failed: {e}")
        if attempt + 1 < retries:
            time.sleep(2)  # wait 2 seconds before the next attempt
    return "无法解析"
def _dushuge_soup(url):
    """Return the parsed HTML of ``url``, or None on a request error or non-200 status."""
    try:
        response = requests.get(url, timeout=10)
    except requests.exceptions.RequestException:
        return None
    if response.status_code != 200:
        return None
    return BeautifulSoup(response.content, 'html.parser')


def get_content_from_dushuge(book_name, chapter_name):
    """Look up a chapter's text on dushuge.com as a fallback source.

    The lookup is three page fetches: search for the book, open the first
    result whose link text contains ``book_name``, then open the first
    chapter link whose text contains ``chapter_name``.

    Args:
        book_name: Book title used as the search query and to pick the result.
        chapter_name: Chapter title matched against the chapter links.

    Returns:
        The text of the ``div#content`` element, or the sentinel string
        "无法解析" when any step fails.
    """
    from urllib.parse import urljoin

    # NOTE(review): the query string declares ie=gbk but book_name is not
    # explicitly GBK-encoded here — confirm what the site expects.
    search_url = f"http://www.dushuge.com/hsdgiohsdigohsog.php?ie=gbk&q={book_name}"
    soup = _dushuge_soup(search_url)
    if soup is None:
        return "无法解析"

    book_href = next(
        (a['href'] for a in soup.find_all('a', href=True) if book_name in a.text),
        "",
    )
    if not book_href:
        return "无法解析"

    # urljoin gives the same result as plain concatenation for "/path" hrefs
    # and also copes with relative or absolute hrefs.
    book_url = urljoin("http://www.dushuge.com", book_href)
    soup = _dushuge_soup(book_url)
    if soup is None:
        return "无法解析"

    chapter_href = ""
    for dd in soup.find_all('dd'):
        a = dd.find('a')
        # Skip anchors without an href instead of raising KeyError.
        if a is not None and a.get('href') and chapter_name in a.text:
            chapter_href = a['href']
            break
    if not chapter_href:
        return "无法解析"

    soup = _dushuge_soup(urljoin(book_url, chapter_href))
    if soup is None:
        return "无法解析"

    content_div = soup.find('div', id='content')
    if content_div:
        return content_div.get_text(separator="\n").replace(" ", "").strip()
    return "无法解析"
def remove_p_tags(content):
    """Strip literal ``<p>`` tags and turn each ``</p>`` into a blank line.

    Only the exact strings ``<p>`` and ``</p>`` are handled; a ``<p>`` tag
    carrying attributes is left untouched.

    Args:
        content: HTML-ish chapter text.

    Returns:
        ``content`` with ``<p>`` removed and ``</p>`` replaced by two newlines.
    """
    # Both targets are fixed strings, so str.replace is enough; no regex needed.
    return content.replace('<p>', '').replace('</p>', '\n\n')
def fetch_content(item):
    """Download and decode one chapter.

    The chapter is requested from fanqienovel first; if that yields the
    sentinel "无法解析", dushuge.com is tried. The text then has its ``<p>``
    tags removed and every character found in ``dic_data`` (keyed by its
    decimal code point) replaced by the mapped character.

    Args:
        item: Dict with the keys ``'ID'`` (chapter id) and ``'title'``.

    Returns:
        A tuple ``(item, text)``. ``text`` is the chapter title followed by
        the decoded body, or a "Failed to fetch" line when both sources fail.
    """
    item_id = item['ID']
    chapter_name = item['title']
    content = get_content_from_fanqienovel(item_id)
    if content == "无法解析":
        # NOTE: book_name is a module-level global assigned by the main loop;
        # it is not passed in as an argument.
        content = get_content_from_dushuge(book_name, chapter_name)
    if content == "无法解析":
        return item, f"Failed to fetch content for {chapter_name}.\n"

    content = remove_p_tags(content)
    # dict.get with the character itself as default replaces the former bare
    # ``except:``; join avoids building the string one character at a time.
    decoded = "".join(dic_data.get(str(ord(ch)), ch) for ch in content)
    return item, f"{chapter_name}\n\n{decoded}\n\n"
def download_novels(item_id_list, book_name, output_file):
    """Download all chapters concurrently and write them in list order.

    Chapters are fetched by a thread pool and finish in arbitrary order.
    Finished chapters are buffered and written as soon as every earlier
    chapter has been written, so the file always follows ``item_id_list``
    order. (Sorting only within batches of completed chapters could place a
    late-finishing chapter after later ones.)

    Args:
        item_id_list: Chapter dicts as accepted by ``fetch_content``.
        book_name: Unused here; kept so existing callers keep working.
        output_file: Path of the UTF-8 text file to (over)write.
    """
    total_chapters = len(item_id_list)
    with open(output_file, 'w', encoding='utf-8') as file, tqdm(total=total_chapters, desc="Progress", unit="chapter") as pbar:
        with ThreadPoolExecutor(max_workers=5) as executor:  # the worker count can be changed
            # Remember each chapter's position at submit time; this avoids an
            # O(n) list.index() per chapter and stays correct if two list
            # entries compare equal.
            futures = {
                executor.submit(fetch_content, item): position
                for position, item in enumerate(item_id_list)
            }
            pending = {}    # position -> chapter text not yet written
            next_to_write = 0
            for future in as_completed(futures):
                _, chapter_content = future.result()
                pending[futures[future]] = chapter_content
                pbar.update(1)
                # Flush the contiguous run of chapters that is now complete.
                while next_to_write in pending:
                    file.write(pending.pop(next_to_write))
                    next_to_write += 1
def get_chapter_list(book_id):
    """Fetch the chapter directory of a book.

    Args:
        book_id: Book identifier sent as the ``bookId`` query parameter.

    Returns:
        A list of ``{"ID": itemId, "title": title}`` dicts in the order the
        API lists them (volume by volume); entries missing either field are
        skipped. An empty list is returned when the request fails.
    """
    url = f"https://fanqienovel.com/api/reader/directory/detail?bookId={book_id}"
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch data: {e}")
        return []
    if response.status_code != 200:
        print("Failed to fetch data")
        return []
    try:
        data = response.json()
    except ValueError:
        # The body was not valid JSON.
        print("Failed to fetch data")
        return []
    if not isinstance(data, dict):
        return []

    # ``or`` guards against keys that are present but null.
    chapter_list_with_volume = (data.get('data') or {}).get('chapterListWithVolume') or []
    item_id_list = []
    for volume in chapter_list_with_volume:
        for chapter in volume:
            item_id = chapter.get('itemId')
            title = chapter.get('title')
            if item_id and title:
                item_id_list.append({"ID": item_id, "title": title})
    return item_id_list
# Main execution: keep prompting for book IDs and download each book.
while True:
    book_id = input("Please enter the book ID: ").strip()
    css = '.page-header-info .info-name h1::text'
    item_id_list = get_chapter_list(book_id)
    if not item_id_list:
        # Nothing to download; do not create (or overwrite) an output file.
        print("No chapters found; check the book ID and try again.")
        continue
    text_content = get_text_from_xpath(book_id, css)
    # book_name must stay a module-level name: fetch_content() reads it.
    if text_content:
        book_name = text_content.strip()
        print("book name:",book_name)
    else:
        book_name = "Unknown_Book"
    # Titles may contain characters that are not allowed in Windows file names.
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', book_name)
    output_file = f"E:/爬虫/{safe_name}.txt"  # change this to your own (existing) directory
    download_novels(item_id_list, book_name, output_file)
    print(f"All chapters saved to {output_file}")
</blockquote>
这段代码的爬取部分参考了油猴脚本,按其逻辑,数据并不是从番茄的数据库拿到的,而是通过别的网站的资源获取的。
最近新看到大佬的程序
https://www.52pojie.cn/thread-1950372-1-1.html
发现cookie中改变novel_web_id来直接得到所有内容。估计番茄是通过判断novel_web_id来发送数据的,因此增加了部分代码通过爆破的方式来获得成功的id(这样就可以保证只要番茄这部分判断代码不变就能够成功获取资源)换掉了原来通过别的源的方式来获取资源
<blockquote>import requests
import json
import re
from parsel import Selector
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from lxml import etree
from tqdm import tqdm
import time
# novel_web_id cookie value: change_userid() starts probing from this number
# and stores the accepted value back here; get_context_from_fanqie() sends it.
# NOTE: the name shadows the built-in id(); it is kept because both functions
# refer to it by this name.
id = 7357767624615331361
# Lookup table for the site's obfuscated font: each key is the decimal code
# point (as a string) of a substituted glyph and each value is the character
# written in its place. fetch_content() looks up str(ord(ch)) here and keeps
# any character without an entry unchanged.
# NOTE(review): some code points in the range have no entry (e.g. 58358,
# 58366) and the accompanying post says the table was not carefully checked,
# so individual mappings may be wrong.
dic_data = {
"58344": "d", "58345": "在", "58346": "干", "58347": "特", "58348": "家", "58349": "军", "58350": "然",
"58351": "表", "58352": "场", "58353": "4", "58354": "要", "58355": "只", "58356": "v", "58357": "和",
"58359": "6", "58360": "别", "58361": "还", "58362": "g", "58363": "现", "58364": "儿", "58365": "岁",
"58368": "此", "58369": "象", "58370": "月", "58371": "3", "58372": "出", "58373": "战", "58374": "工",
"58375": "相", "58376": "。", "58377": "男", "58378": "直", "58379": "失", "58380": "世", "58381": "F",
"58382": "都", "58383": "平", "58384": "文", "58385": "什", "58386": "V", "58387": "o", "58388": "将",
"58389": "真", "58390": "工", "58391": "那", "58392": "当", "58394": "会", "58395": "立", "58396": "此",
"58397": "山", "58398": "是", "58399": "十", "58400": "张", "58401": "学", "58402": "气", "58403": "大",
"58404": "爱", "58405": "两", "58406": "命", "58407": "全", "58408": "后", "58409": "东", "58410": "性",
"58411": "通", "58412": "被", "58413": "1", "58414": "它", "58415": "乐", "58416": "接", "58417": "而",
"58418": "感", "58419": "车", "58420": "山", "58421": "公", "58422": "了", "58423": "常", "58424": "以",
"58425": "何", "58426": "可", "58427": "话", "58428": "先", "58429": "p", "58430": "j", "58431": "叫",
"58432": "轻", "58433": "m", "58434": "十", "58435": "以", "58436": "着", "58437": "变", "58438": "尔",
"58439": "快", "58440": "上", "58441": "个", "58442": "说", "58443": "小", "58444": "色", "58445": "里",
"58446": "安", "58447": "花", "58448": "远", "58449": "7", "58450": "难", "58451": "师", "58452": "放",
"58453": "十", "58454": "报", "58455": "认", "58456": "亩", "58457": "道", "58458": "s", "58460": "克",
"58461": "地", "58462": "度", "58463": "上", "58464": "好", "58465": "机", "58466": "u", "58467": "民",
"58468": "写", "58469": "把", "58470": "万", "58471": "同", "58472": "水", "58473": "新", "58474": "没",
"58475": "书", "58476": "申", "58477": "吃", "58478": "像", "58479": "斯", "58480": "5", "58481": "为",
"58482": "v", "58483": "白", "58484": "几", "58485": "日", "58486": "教", "58487": "看", "58488": "但",
"58489": "第", "58490": "加", "58491": "候", "58492": "作", "58493": "上", "58494": "拉", "58495": "住",
"58496": "有", "58497": "法", "58498": "r", "58499": "事", "58500": "应", "58501": "位", "58502": "利",
"58503": "你", "58504": "声", "58505": "身", "58506": "国", "58507": "问", "58508": "马", "58509": "女",
"58510": "他", "58511": "y", "58512": "比", "58513": "父", "58514": "", "58515": "a", "58516": "h",
"58517": "n", "58518": "c", "58519": "x", "58520": "边", "58521": "美", "58522": "对", "58523": "所",
"58524": "金", "58525": "活", "58526": "回", "58527": "意", "58528": "到", "58529": "之", "58530": "从",
"58531": "j", "58532": "知", "58533": "又", "58534": "内", "58535": "冈", "58536": "点", "58537": "o",
"58538": "一", "58539": "定", "58540": "8", "58541": "r", "58542": "b", "58543": "正", "58544": "或",
"58545": "夫", "58546": "向", "58547": "德", "58548": "听", "58549": "更", "58551": "得", "58552": "告",
"58553": "并", "58554": "本", "58555": "q", "58556": "过", "58557": "记", "58558": "上", "58559": "让",
"58560": "打", "58561": "f", "58562": "人", "58563": "就", "58564": "者", "58565": "去", "58566": "原",
"58567": "满", "58568": "体", "58569": "做", "58570": "经", "58571": "k", "58572": "走", "58573": "如",
"58574": "孩", "58575": "c", "58576": "g", "58577": "给", "58578": "使", "58579": "物", "58581": "最",
"58582": "笑", "58583": "部", "58585": "员", "58586": "等", "58587": "受", "58588": "k", "58589": "行",
"58590": "", "58591": "条", "58592": "果", "58593": "动", "58594": "光", "58595": "门", "58596": "头",
"58597": "见", "58598": "往", "58599": "白", "58600": "解", "58601": "成", "58602": "外", "58603": "天",
"58604": "能", "58605": "干", "58606": "名", "58607": "其", "58608": "发", "58609": "总", "58610": "母",
"58611": "的", "58612": "死", "58613": "手", "58614": "入", "58615": "路", "58616": "进", "58617": "心",
"58618": "来", "58619": "h", "58620": "时", "58621": "力", "58622": "多", "58623": "开", "58624": "已",
"58625": "许", "58626": "d", "58627": "至", "58628": "由", "58629": "很", "58630": "界", "58631": "n",
"58632": "小", "58633": "与", "58634": "之", "58635": "想", "58636": "代", "58637": "么", "58638": "分",
"58639": "牛", "58640": "口", "58641": "再", "58642": "妈", "58643": "望", "58644": "次", "58645": "西",
"58646": "风", "58647": "种", "58648": "带", "58649": "J", "58651": "实", "58652": "情", "58653": "才",
"58654": "这", "58656": "F", "58657": "我", "58658": "神", "58659": "格", "58660": "长", "58661": "觉",
"58662": "间", "58663": "年", "58664": "眼", "58665": "无", "58666": "不", "58667": "亲", "58668": "关",
"58669": "结", "58670": "0", "58671": "友", "58672": "信", "58673": "下", "58674": "却", "58675": "重",
"58676": "已", "58677": "老", "58678": "2", "58679": "音", "58680": "字", "58681": "m", "58682": "呢",
"58683": "明", "58684": "之", "58685": "前", "58686": "高", "58687": "p", "58688": "b", "58689": "目",
"58690": "太", "58691": "。", "58692": "9", "58693": "起", "58694": "穆", "58695": "她", "58696": "也",
"58697": "w", "58698": "用", "58699": "方", "58700": "子", "58701": "英", "58702": "每", "58703": "理",
"58704": "便", "58705": "四", "58706": "数", "58707": "期", "58708": "中", "58709": "c", "58710": "外",
"58711": "样", "58712": "a", "58713": "海", "58714": "们", "58715": "任"
}
def get_text_from_xpath(book_id, css):
    """Fetch a book's page and return the first match of a CSS selector.

    Despite the function name, ``css`` is a CSS selector (it is passed to
    ``Selector.css``), not an XPath expression. The main loop uses it to
    read the book title.

    Args:
        book_id: Book identifier appended to ``https://fanqienovel.com/page/``.
        css: CSS selector whose first match is returned.

    Returns:
        The first matched value, or ``None`` when the request fails, the
        response status is not 200, or the selector matches nothing (or only
        an empty string).
    """
    url = f"https://fanqienovel.com/page/{book_id}"
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {url}: {e}")
        return None
    if response.status_code != 200:
        return None
    elements = Selector(response.text).css(css).get()  # book title
    if not elements:
        return None
    print("elements",elements)
    return elements
def change_userid(max_attempts=200, max_errors=5):
    """Probe successive ``novel_web_id`` values and store the first accepted one.

    Starting from the module-level ``id``, a fixed test chapter is requested
    with each candidate value; a candidate counts as accepted when the
    returned content is longer than 1000 characters, and is then written
    back to the module-level ``id``.

    The search is bounded so a persistent failure cannot loop forever (the
    unbounded version retried the same value endlessly on a repeated error).

    Args:
        max_attempts: Upper bound on the number of requests made.
        max_errors: Number of consecutive failed requests after which the
            search gives up.

    Returns:
        True when an accepted value was stored in ``id``, False when the
        search gave up and ``id`` was left unchanged.
    """
    global id
    testid = 7101675619141976584
    useid = id
    consecutive_errors = 0
    for _ in range(max_attempts):
        test_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE',
            'cookie' : f'novel_web_id={useid}'
        }
        try:
            # Send the request and extract the chapter content.
            res = requests.get(f'https://fanqienovel.com/api/reader/full?itemId={testid}', headers=test_headers, timeout=10)
            data = json.loads(res.text)['data']
            content = data['chapterData']['content']
        except (requests.exceptions.RequestException, ValueError, KeyError, TypeError) as e:
            print(f"请求失败,ID={useid}, 错误: {e}")
            consecutive_errors += 1
            if consecutive_errors >= max_errors:
                break
            time.sleep(1)  # wait one second, then retry the same value
            continue
        consecutive_errors = 0
        if isinstance(content, str) and len(content) > 1000:
            print("成功的id", useid)
            id = useid
            return True
        useid += 1
    print("No usable novel_web_id found; giving up.")
    return False
def get_context_from_fanqie(item_id,retries=5):
    """Fetch one chapter's raw content from the fanqienovel reader API.

    The request carries the module-level ``id`` as the ``novel_web_id``
    cookie. One initial request is made, followed by up to ``retries``
    repeats one second apart while no content is obtained.

    Args:
        item_id: Chapter identifier sent as the ``itemId`` query parameter.
        retries: Number of additional attempts after the first request.

    Returns:
        The ``data.chapterData.content`` value of the JSON response, or the
        sentinel string "无法解析" when every attempt fails.
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE',
        'cookie':f'novel_web_id={id}'
    }
    url = f'https://fanqienovel.com/api/reader/full?itemId={item_id}'
    attempts = max(retries, 0) + 1
    for attempt in range(attempts):
        try:
            # Without a timeout a stalled connection would hang a worker thread.
            response = requests.get(url, headers=headers, timeout=10)
            data = json.loads(response.text)['data']
            return data['chapterData']['content']
        except (requests.exceptions.RequestException, ValueError, KeyError, TypeError):
            # Network failure, invalid JSON, or a payload without chapter
            # data: retry while attempts remain.
            if attempt + 1 < attempts:
                time.sleep(1)
    return "无法解析"
def remove_p_tags(content):
    """Strip literal ``<p>`` tags and turn each ``</p>`` into a blank line.

    Only the exact strings ``<p>`` and ``</p>`` are handled; a ``<p>`` tag
    carrying attributes is left untouched.

    Args:
        content: HTML-ish chapter text.

    Returns:
        ``content`` with ``<p>`` removed and ``</p>`` replaced by two newlines.
    """
    # Both targets are fixed strings, so str.replace is enough; no regex needed.
    return content.replace('<p>', '').replace('</p>', '\n\n')
def fetch_content(item):
    """Download and decode one chapter.

    The chapter text has its ``<p>`` tags removed and every character found
    in ``dic_data`` (keyed by its decimal code point) replaced by the mapped
    character.

    Args:
        item: Dict with the keys ``'ID'`` (chapter id) and ``'title'``.

    Returns:
        A tuple ``(item, text)``. ``text`` is the chapter title followed by
        the decoded body, or a "Failed to fetch" line when the download
        returned the sentinel "无法解析".
    """
    item_id = item['ID']
    chapter_name = item['title']
    content = get_context_from_fanqie(item_id)
    if content == "无法解析":
        return item, f"Failed to fetch content for {chapter_name}.\n"

    content = remove_p_tags(content)
    # dict.get with the character itself as default replaces the former bare
    # ``except:``; join avoids building the string one character at a time.
    decoded = "".join(dic_data.get(str(ord(ch)), ch) for ch in content)
    return item, f"{chapter_name}\n\n{decoded}\n\n"
def download_novels(item_id_list, book_name, output_file):
    """Download all chapters concurrently and write them in list order.

    Chapters are fetched by a thread pool and finish in arbitrary order.
    Finished chapters are buffered and written as soon as every earlier
    chapter has been written, so the file always follows ``item_id_list``
    order. (Sorting only within batches of completed chapters could place a
    late-finishing chapter after later ones.)

    Args:
        item_id_list: Chapter dicts as accepted by ``fetch_content``.
        book_name: Unused here; kept so existing callers keep working.
        output_file: Path of the UTF-8 text file to (over)write.
    """
    total_chapters = len(item_id_list)
    with open(output_file, 'w', encoding='utf-8') as file, tqdm(total=total_chapters, desc="Progress", unit="chapter") as pbar:
        with ThreadPoolExecutor(max_workers=5) as executor:  # the worker count can be changed
            # Remember each chapter's position at submit time; this avoids an
            # O(n) list.index() per chapter and stays correct if two list
            # entries compare equal.
            futures = {
                executor.submit(fetch_content, item): position
                for position, item in enumerate(item_id_list)
            }
            pending = {}    # position -> chapter text not yet written
            next_to_write = 0
            for future in as_completed(futures):
                _, chapter_content = future.result()
                pending[futures[future]] = chapter_content
                pbar.update(1)
                # Flush the contiguous run of chapters that is now complete.
                while next_to_write in pending:
                    file.write(pending.pop(next_to_write))
                    next_to_write += 1
def get_chapter_list(book_id):
    """Fetch the chapter directory of a book.

    Args:
        book_id: Book identifier sent as the ``bookId`` query parameter.

    Returns:
        A list of ``{"ID": itemId, "title": title}`` dicts in the order the
        API lists them (volume by volume); entries missing either field are
        skipped. An empty list is returned when the request fails.
    """
    url = f"https://fanqienovel.com/api/reader/directory/detail?bookId={book_id}"
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch data: {e}")
        return []
    if response.status_code != 200:
        print("Failed to fetch data")
        return []
    try:
        data = response.json()
    except ValueError:
        # The body was not valid JSON.
        print("Failed to fetch data")
        return []
    if not isinstance(data, dict):
        return []

    # ``or`` guards against keys that are present but null.
    chapter_list_with_volume = (data.get('data') or {}).get('chapterListWithVolume') or []
    item_id_list = []
    for volume in chapter_list_with_volume:
        for chapter in volume:
            item_id = chapter.get('itemId')
            title = chapter.get('title')
            if item_id and title:
                item_id_list.append({"ID": item_id, "title": title})
    return item_id_list
# Main execution
while True:
change_userid()
book_id = input("Please enter the book ID: ")
css = '.page-header-info .info-name h1::text'
item_id_list = get_chapter_list(book_id)
text_content = get_text_from_xpath(book_id, css)
if text_content:
book_name = text_content.strip()
print("book name:",book_name)
else:
book_name = "Unknown_Book"
output_file = f"E:/爬虫/{book_name}.txt" #要改
download_novels(item_id_list, book_name, output_file)
print(f"All chapters saved to {output_file}")</blockquote>
注:此逻辑暂时已经不能用了,若要下载请看本人的新文章,目前还处于维护状态
|