本帖最后由 ermao 于 2019-6-19 21:46 编辑
在问答区看到的需求,正好拿来练手。
接口没有反爬措施,请求时加上 User-Agent(UA)请求头就可以了。
[Python] 纯文本查看 复制代码
import requests
import json
import threading
import time
import os
import re
# Story ids collected by get_papers() and consumed by thread_control().
paper_id = list()

# Number of get_single_paper() calls currently in flight; guarded by glock.
threads = 0
glock = threading.Lock()

# Request headers sent with every API call (only a User-Agent is set).
header = {
    'User-Agent': 'DailyApi/4 (Linux; Android 5.1.1; xiaomi 6 Build/xiaomi/xiaomi 6/x86/LMY48Z/zh_CN) Google-HTTP-Java-Client/1.22.0 (gzip) Google-HTTP-Java-Client/1.22.0 (gzip)',
}

# Output directory for downloaded stories; assigned by get_papers().
path = ''
def get_single_paper(paper_id):
    """Download one story and save its HTML body to disk.

    The story JSON is requested from the ``/api/4/story/<paper_id>`` endpoint
    with the module-level ``header``. When the response contains ``title``
    and ``body``, the body is written (UTF-8) to
    ``./<path>/<paper_id>-<title>.html``, where ``path`` is the module-level
    output directory and the characters matched by the regex below are
    removed from the title. The ``css`` field of the response is
    deliberately not used (the original author noted it made no visible
    difference).

    The module-level ``threads`` counter is incremented on entry and is
    always decremented on exit, including when the request, the JSON
    decoding, or the file write raises, so code polling the counter is never
    left waiting on a worker that has already died.

    Args:
        paper_id: Story id as a string; it is concatenated into both the
            request URL and the output file name.
    """
    global threads
    with glock:
        threads += 1
    try:
        url = 'https://news-at.zhihu.com/api/4/story/' + paper_id
        res = requests.get(url, headers=header)
        resJson = json.loads(res.content.decode('utf-8'))
        try:
            title = resJson['title']
            body = resJson['body']
        except (TypeError, KeyError):
            # TypeError: the payload is not a JSON object.
            # KeyError: it is an object but lacks 'title' or 'body'.
            print('json读取失败')
            return
        # File name is "<id>-<sanitized title>.html".
        file_name = (paper_id + '-' +
                     re.sub(r'[\\/:\*\?"<>\|]', '', title) + '.html')
        # Every story goes to its own file, so no lock is needed for writing.
        with open('./' + path + '/' + file_name, 'w', encoding='utf-8') as f:
            f.write(body)
    finally:
        with glock:
            threads -= 1
def get_papers(id, timestamp_end):
    """Collect the story ids of one section into the module-level ``paper_id``.

    The first page is requested from ``/api/4/section/<id>``; each following
    page is requested from ``/api/4/section/<id>/before/<timestamp>`` using
    the ``timestamp`` value returned with the previous page. Paging stops as
    soon as that value is no longer greater than ``timestamp_end``; the
    stories of the page that carried it have already been collected by then.

    As a side effect the module-level ``path`` (output directory used by
    ``get_single_paper``) is set for this section and the directory is
    created if needed. Sections without a predefined directory name use the
    section id itself, instead of failing on ``os.makedirs('')`` or silently
    reusing the directory of a previous call.

    Args:
        id: Section id as a string (e.g. ``'35'`` or ``'2'``).
        timestamp_end: Lower time limit; paging continues only while the
            timestamp returned by the API is greater than this value.

    Returns:
        The total number of ids now stored in ``paper_id`` (ids accumulate
        across calls).
    """
    global path
    section_dirs = {'35': '小事', '2': '瞎扯-吐槽'}
    path = section_dirs.get(id, id)
    os.makedirs(path, exist_ok=True)

    section_url = 'https://news-at.zhihu.com/api/4/section/' + id
    timestamp = 0  # 0 means "no page fetched yet"
    while True:
        if timestamp > timestamp_end:
            res = requests.get(section_url + '/before/' + str(timestamp),
                               headers=header)
        elif timestamp == 0:
            res = requests.get(section_url, headers=header)
        else:
            print('已到日期上限')
            break
        text = res.content.decode('utf-8')  # decode once, reuse below
        print(text)
        resJson = json.loads(text)
        timestamp = resJson['timestamp']
        paper_id.extend(story['id'] for story in resJson['stories'])
    return len(paper_id)
def thread_control(N):
    """Download every id in ``paper_id`` with at most ``N`` live worker threads.

    One thread running ``get_single_paper`` is started per collected id. The
    loop throttles on the threads it started itself (``Thread.is_alive``),
    not on a counter the workers update after they begin running, so the
    limit cannot be overshot and a worker that dies with an exception
    cannot stall the loop. All remaining workers are joined before the
    completion message is printed, and an empty ``paper_id`` is handled.

    Args:
        N: Maximum number of concurrently running download threads. Values
            below 1 are treated as 1 (they would otherwise never let the
            loop advance).
    """
    limit = max(1, N)
    running = []
    for story_id in paper_id:
        worker = threading.Thread(target=get_single_paper,
                                  args=(str(story_id), ))
        worker.start()
        running.append(worker)
        # Wait until a slot is free before starting the next download.
        while True:
            running = [w for w in running if w.is_alive()]
            if len(running) < limit:
                break
            time.sleep(0.2)
    # Make sure every download has finished before reporting completion.
    for worker in running:
        worker.join()
    print('已完成')
    return
# Known section ids:
#   35 -> "小事" (Small Things)
#   2  -> "瞎扯·如何正确的吐槽" (Chit-chat: how to roast properly)
# Example time limits (Unix timestamps):
#   1490536800 -> fetch back to 2017-03-26 (the original note said 20170306)
#   1553608800 -> fetch back to 2019-03-26 (the original note said 20190306)
# First argument: section id; second argument: time limit.
nums = get_papers('2', 1490536800)
print(''.join(('ID采集完成,共', str(nums), '个,开始下载')))
# Argument: maximum number of download threads.
thread_control(20)
# To fetch a single story for debugging: get_single_paper('9712276')
截图:
|