本帖最后由 laogepj 于 2021-5-9 09:35 编辑
看到论坛都没有php采集爬虫我就不服了,php照样并发批量采集,一点都不耽误快速找小姐姐。
142张照片8秒搞定
1.来个PHP批量(同步)爬虫下载小姐姐图片到本地
2.里面引用的库文件请转移到这里下载：https://github.com/playm3u8/whttp
3.和download.php目录同级即可
4.执行命令 php download.php
5.php版本在7.2以上
[PHP] 纯文本查看 复制代码
<?php
require_once './whttp/autoload.php';
use PL\Whttp,PL\Debug;
/**
 * Collect the image paths attached to a single bcy.net post.
 *
 * Requests the post's detail page, takes the text that the page passes to
 * `window.__ssr_data = JSON.parse("...")` (presumably what `core()` extracts
 * between the two markers - confirm against the whttp docs), strips the
 * backslash escaping and decodes the result as JSON.
 *
 * @param int|string $id Post (item) id interpolated into the detail-page URL.
 *
 * @return array Values of `path` for every entry of `detail.post_data.multi`,
 *               keyed like that list. Empty when the page could not be read,
 *               the payload is not valid JSON, or the post has no images.
 */
function getImg($id): array
{
    $html = Whttp::get("https://bcy.net/item/detail/{$id}?_source_page=hashtag")
        ->core('window.__ssr_data = JSON.parse("', '");')
        ->getBody();

    // A failed request or a page without the SSR payload leaves nothing to decode.
    if (!is_string($html) || $html === '') {
        return [];
    }

    // The payload is JSON embedded in a JS string literal, hence the unescaping.
    $json = json_decode(stripslashes($html), true);

    $multi = $json['detail']['post_data']['multi'] ?? null;
    if (!is_array($multi)) {
        return [];
    }

    $retlist = [];
    foreach ($multi as $key => $value) {
        // Skip malformed entries instead of emitting a notice and a null URL.
        if (isset($value['path'])) {
            $retlist[$key] = $value['path'];
        }
    }

    return $retlist;
}
// Feed endpoint that lists the posts of circle 492.
$param = [
    'url' => 'https://bcy.net/apiv3/common/circleFeed?circle_id=492',
];
$retlist = [];

// Mark the timing start before the first request. The summary at the bottom
// always reads the 'begin' marker, so it must also exist on the error path.
// As a side effect the feed request itself is now included in the measurement.
Debug::remark('begin');

$result = Whttp::get($param['url']);
if ((int) $result->getCode() !== 200) {
    print("请求错误"."\n");
} else {
    $getlist = $result->getJson('data.items');
    // Treat a response without a usable data.items list as an empty feed.
    if (!is_array($getlist)) {
        $getlist = [];
    }

    // Resolve the image list of every post in the feed.
    foreach ($getlist as $key => $value) {
        if (!isset($value['item_detail']['item_id'])) {
            // Not a regular post entry; nothing to download.
            continue;
        }
        $itemId = $value['item_detail']['item_id'];
        $retlist[$key] = [
            'id'    => $itemId,
            'plain' => $value['item_detail']['plain'] ?? '',
            'img'   => getImg($itemId),
        ];
    }

    // Download the images, one sub-directory per post.
    foreach ($retlist as $value) {
        print("id:{$value['id']}"."\n");
        print("plain:{$value['plain']}"."\n");

        // Posts without images would hand an empty URL list to the downloader.
        if (!empty($value['img'])) {
            $img = Whttp::get($value['img'])
                ->savepath("./bcy_img/".$value['id'])
                ->concurrent(10) // number of concurrent download tasks
                ->gany(function ($data) {
                    // Invoked with the result of each individual download.
                    $error = $data['error'];
                    $url = $data['info']['url'];
                    if (empty($error)) {
                        print("下载成功:{$url}"."\n");
                    } else {
                        print("下载失败:{$url}"."\n");
                    }
                });
            $img->getDownload();
        }

        print("=================我是分隔符================="."\n");
    }
}

// Mark the timing end.
Debug::remark('end');
// Elapsed time between the two markers, 4 decimal places.
$cost = floatval(Debug::getRangeTime('begin','end',4));
// Memory consumed between the two markers.
$memory = strtolower(Debug::getRangeMem('begin','end'));
print("[耗时:".$cost."秒] [内存消耗:".$memory."]");
|