【原创源码】【Python】没啥卵用的爬虫
import json
import os
import re
import sys
import threading

import requests
class DownloadThread(threading.Thread):
    """Thread that downloads one image and then refreshes the progress bar.

    The thread relies on names defined at module level: ``download_file``,
    ``ProgressBar``, ``threadLock`` (guards the shared counters),
    ``LoadDownList`` (one entry is popped per finished download) and
    ``imgList`` (its length is the progress total).
    """

    def __init__(self, imgUrl, filePath):
        """Store the download parameters.

        Args:
            imgUrl: URL of the image to fetch.
            filePath: Destination path of the image on disk.
        """
        super().__init__()
        self.imgUrl = imgUrl  # image URL
        self.filePath = filePath  # destination path

    def run(self):
        """Download the image, then update the shared progress state."""
        download_file(self.imgUrl, self.filePath)
        # ``with`` guarantees the lock is released even if popping from the
        # list or drawing the bar raises; a manual acquire/release pair would
        # leave the lock held and block every other worker.
        with threadLock:
            if LoadDownList:
                LoadDownList.pop(0)  # mark one more download as finished
                status = "下载中..."
            else:
                status = "下载完成\r\n"
            ProgressBar(len(imgList) - len(LoadDownList),
                        status, len(imgList), BarLen=50)
def download_file(file_url, file_path, timeout=30):
    """Download ``file_url`` and write the response body to ``file_path``.

    Args:
        file_url: URL of the file to fetch.
        file_path: Destination path; ``exists`` is called first so that the
            parent directory can be created.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling thread
            forever.

    Returns:
        True when the file was written, False when the request or the write
        failed (the error is printed).
    """
    try:
        exists(file_path)  # make sure the target directory is there
        response = requests.get(file_url, timeout=timeout)
        # Turn HTTP error statuses into exceptions so that an error page is
        # never saved as if it were the image.
        response.raise_for_status()
        with open(file_path, 'wb') as f:
            f.write(response.content)
    except OSError as e:
        # requests' exceptions derive from IOError/OSError, so this covers
        # both network failures and file-system failures.
        print('下载错误', e)
        return False
    return True
def ProgressBar(Clen=0, Text="", TLen=100, BarLen=100, Symbol=">", Space=" "):
    """Draw a single-line console progress bar on stdout.

    The line starts with a carriage return so that successive calls
    overwrite each other.

    Args:
        Clen: Amount of work completed so far.
        Text: Text appended after the percentage.
        TLen: Total amount of work. A total of 0 is displayed as 0 % instead
            of raising ZeroDivisionError.
        BarLen: Width of the bar in characters.
        Symbol: Character used for the completed part of the bar.
        Space: Character used for the remaining part of the bar.
    """
    ratio = Clen / TLen if TLen else 0.0
    Percentage = ('%.2f' % (ratio * 100)).rjust(6, ' ')
    # Clamp the filled width so an out-of-range ratio can neither overflow
    # the bar nor produce a negative padding count.
    FinishedLen = max(0, min(BarLen, int(BarLen * ratio)))
    BarStr = Symbol * FinishedLen + Space * (BarLen - FinishedLen)
    sys.stdout.write('\r' + BarStr + '[%s%%]%s' % (Percentage, Text))
    sys.stdout.flush()  # show the update immediately
def exists(path):
    """Create the parent directory of ``path`` if it does not exist yet.

    Args:
        path: Path of a file whose directory should be available.

    Returns:
        True when the directory exists after the call (or ``path`` has no
        directory component), False when it could not be created.
    """
    # os.path.dirname returns the directory as a string; os.path.split would
    # return a (head, tail) tuple, which os.path.isdir/os.makedirs reject.
    file_dir = os.path.dirname(path)
    if not file_dir:
        return True  # bare file name: nothing to create
    try:
        # exist_ok avoids a race when several download threads create the
        # same directory at the same time.
        os.makedirs(file_dir, exist_ok=True)
    except OSError as e:
        print('判断路径是否存在失败', e)
        return False
    return True
def getNewsList(Page, timeout=30):
    """Fetch one page of the news list.

    Args:
        Page: Zero-based page index; each page holds 12 entries.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``{'Id': ..., 'Title': ...}`` dicts built from the
        response's ``data.items`` entries, or False when the request did not
        return status 200 or the response carried no items.

    Raises:
        requests.RequestException: If the HTTP request itself fails.
    """
    url = 'https://apps.game.qq.com/cmc/cross?'
    Data = {
        'serviceId': '18',
        'filter': 'channel',
        'sortby': 'sIdxTime',
        'source': 'web_pc',
        'limit': '12',
        'logic': 'or',
        'typeids': '1',
        'chanid': '1762',  # announcements
        'start': str(Page * 12),
        'withtop': 'yes',
        'exclusiveChannel': '4',
        'exclusiveChannelSign': '72e3f4b6d42e9a9d42c037d1444ed4b8',
        'time': '1629288543',
    }
    Headers = {
        'Host': 'apps.game.qq.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate, br',
        'Origin': 'https://pvp.qq.com',
        'Connection': 'keep-alive',
        'Referer': 'https://pvp.qq.com/',
        'TE': 'Trailers',
    }
    r = requests.get(url, headers=Headers, params=Data, timeout=timeout)
    if r.status_code != 200:
        return False

    # Parse the body once instead of re-decoding it for every lookup, and use
    # .get() so that a response without the expected keys is reported as
    # empty rather than raising KeyError.
    data = r.json().get('data')
    if not data:
        print('为空值')
        return False
    items = data.get('items')
    if not items:
        print('为空值')
        return False

    return [
        {'Id': Info['iId'], 'Title': Info['sChannelInfoJson']['sChannelTitle']}
        for Info in items
    ]
def getNewsIndex(ID, timeout=30):
    """Fetch one news entry and extract the image URLs from its HTML body.

    Args:
        ID: Identifier of the news entry, sent as the ``id`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of image URLs (without their ``?width=..&height=..`` query)
        found in the entry's ``msg.sContent`` HTML; the list may be empty.
        False when the request did not return status 200 or the response
        could not be interpreted.

    Raises:
        requests.RequestException: If the HTTP request itself fails.
    """
    url = 'https://apps.game.qq.com/wmp/v3.1/public/searchNews.php?'
    Data = {
        'p0': '18',
        'source': 'web_pc',
        'id': ID,
    }
    # NOTE(review): the Cookie below is a hard-coded session value copied
    # from a browser; it presumably expires - confirm whether it is needed.
    Headers = {
        'Host': 'apps.game.qq.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Referer': 'https://pvp.qq.com/',
        'Cookie': 'pgv_info=ssid=s3628213827; pgv_pvid=5567290544; pvpqqcomrouteLine=newsdetail; tokenParams=%3Ftid%3D527239; eas_sid=f1I6P2l9y298o1u8T18085t1m9',
    }
    r = requests.get(url, headers=Headers, params=Data, timeout=timeout)
    if r.status_code != 200:
        return False

    # The endpoint answers with JavaScript of the form "var searchObj={...};".
    # re.search yields the single match; passing re.findall's list straight
    # to json.loads would raise TypeError.
    found = re.search(r"var searchObj\=(.*)\;", r.text)
    if found is None:
        print('为空值')
        return False
    try:
        r_json = json.loads(found.group(1))
    except ValueError:
        print('为空值')
        return False

    msg = r_json.get('msg') if isinstance(r_json, dict) else None
    if not isinstance(msg, dict) or not msg:
        print('为空值')
        return False
    html_text = msg.get('sContent')  # HTML content of the entry
    if not html_text:
        print('为空值')
        return False

    # Example tag:
    # <img src="https://shp.qpic.cn/cfwebcap/0/bec7254f482cdb97af2c0662863b4e32/0/?width=739&height=5608" alt="" width="739" height="5608" />
    # [^"?]+ keeps each match inside one src attribute (a greedy .* could
    # span several tags), and the {1,5} quantifier now applies to digits.
    # Both "&" and "&amp;" are accepted because the separator may be
    # HTML-escaped in the payload - TODO confirm against a live response.
    return re.findall(
        r'<img\s+src="([^"?]+)\?width=\d{1,5}&(?:amp;)?height=\d{1,5}',
        html_text)
# Driver: walk the news list page by page and download every image found in
# each entry, one thread per image.
threadLock = threading.Lock()  # guards LoadDownList / progress output
newsTop = 0  # Id of the newest entry on the first page
for i in range(1000):
    newsList = getNewsList(i)
    if not newsList:
        # getNewsList returns False when nothing could be fetched; stop
        # instead of calling .sort() on it.
        break
    newsList.sort(key=lambda ele: ele['Id'], reverse=True)  # newest first
    # newsList is a list of dicts, so the top Id lives in its first element.
    if i == 0:
        newsTop = newsList[0]['Id']
    elif newsTop == newsList[0]['Id']:
        # The first page's top entry showed up again: stop paging.
        break
    for Info in newsList:
        print(Info['Title'])
        imgList = getNewsIndex(Info['Id'])
        # getNewsIndex returns False on failure and [] when the entry has no
        # images; a truthiness test covers both without calling len(False).
        if imgList:
            threads = []
            # One entry per image except the last: each finished thread pops
            # one, and the thread that finds the list empty reports completion.
            LoadDownList = list(range(1, len(imgList)))
            # Titles are free text; replace characters that are not valid in
            # file names so the path stays inside img/.
            safeTitle = re.sub(r'[\\/:*?"<>|\r\n]', '_', str(Info['Title']))
            for index, imgUrl in enumerate(imgList):
                filePath = 'img/%s_%s_%s.jpg' % (
                    Info['Id'], safeTitle, index)
                newThread = DownloadThread(imgUrl, filePath)
                newThread.start()
                threads.append(newThread)
            # Wait for every download of this entry before moving on.
            for tList in threads:
                tList.join()
        else:
            print(
                '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>公告未发现图片')
谢谢你,我想请问其他网站也可以这样登录吗 kingaero 发表于 2021-8-21 23:11
谢谢你,我想请问其他网站也可以这样登录吗
没有登录 只是爬一些信息 这些代码的意义何在呢?学习 爱的天使 发表于 2021-8-22 08:51
这些代码的意义何在呢?学习
标题说明了一切 谢谢分享,了解学习参考一下 emmmm,学习学习 学习学习
页:
[1]