PHP 多线程爬取天涯帖子
使用了 QueryList 框架来解析 DOM
话不多说,上代码
```
<?php
namespace app\command\crawler;
use GuzzleHttp\Psr7\Response;
use QL\QueryList;
/**
 * Crawls a Tianya forum thread with QueryList and dumps it to "<title>.txt"
 * (a path relative to the current working directory).
 *
 * The first page is fetched synchronously; the remaining pages are fetched
 * concurrently through QueryList::multiGet() and appended in page order.
 */
class Tianya
{
    /** @var string Address of the thread's first page; pagination relies on it containing "-1.shtml". */
    protected $url = "";

    /** @var string|null Thread title, read from ".s_title"; also used to build the output file name. */
    protected $title;

    /** @var string|null Whitespace-normalised text of the first ".atl-info" element. */
    protected $author;

    /** @var string[] Addresses of pages 2..N, filled by getUrls(). */
    protected $urls = [];

    /** @var array|null Posts scraped from the first page. */
    protected $mainPosts;

    /** @var array Posts of the other pages, keyed by the request index reported by multiGet(). */
    protected $posts = [];

    /**
     * Crawls the whole thread: first page, then every other page, writing the result file.
     *
     * @throws \RuntimeException when the output file cannot be opened
     */
    public function execute()
    {
        $html = QueryList::get($this->url);
        $this->title = $html->find(".s_title")->text();
        $this->author = $this->handlerSymbol($html->find(".atl-info:eq(0)")->text());

        // The pager mixes numeric links with non-numeric ones; only the numbers matter here.
        $pages = $html->find(".atl-pages:eq(0) a")->texts()->filter(function ($item) {
            return is_numeric($item);
        })->all();
        if (!empty($pages)) {
            $this->getUrls(max($pages));
        }

        $this->mainPosts = $html->rules($this->postRules())
            ->range(".atl-item")
            ->query()
            ->getData()
            ->all();

        $this->writeFile();
        $this->crawlOtherPage();
    }

    /**
     * Queues the addresses of pages 2..$maxPage, derived from the first-page address.
     *
     * Nothing is queued when $maxPage is not numeric or when the address does not
     * contain "-1.shtml": in the latter case the replacement would be a no-op and
     * the first page would be downloaded again for every page number.
     *
     * @param int|string $maxPage highest page number found in the pager
     */
    protected function getUrls($maxPage)
    {
        $firstPage = (string) $this->url;
        if (!is_numeric($maxPage) || strpos($firstPage, "-1.shtml") === false) {
            return;
        }

        $lastPage = (int) $maxPage;
        for ($page = 2; $page <= $lastPage; $page++) {
            $this->urls[] = str_replace("-1.shtml", "-{$page}.shtml", $firstPage);
        }
    }

    /**
     * Creates (or truncates) the output file and writes the title, the author line
     * and the posts of the first page.
     *
     * @throws \RuntimeException when the output file cannot be opened
     */
    protected function writeFile()
    {
        // Mode "w" truncates an existing file, replacing the former unlink + append.
        $handler = $this->openOutput("w");
        try {
            fwrite($handler, $this->title . PHP_EOL);
            fwrite($handler, $this->author . PHP_EOL);
            $this->writeItems($handler, $this->mainPosts ?: []);
        } finally {
            fclose($handler);
        }
    }

    /**
     * Appends the posts of the concurrently fetched pages to the output file.
     *
     * @throws \RuntimeException when the output file cannot be opened
     */
    protected function writePosts()
    {
        if (empty($this->posts)) {
            return;
        }

        // Responses arrive in arbitrary order; the key is the request index, so
        // sorting by key (not by value) restores the page order.
        ksort($this->posts);

        $handler = $this->openOutput("a+");
        try {
            foreach ($this->posts as $page) {
                $this->writeItems($handler, $page);
            }
        } finally {
            fclose($handler);
        }
    }

    /**
     * Fetches all queued pages concurrently and appends their posts to the output file.
     *
     * Pages that fail to download are reported through error_log() and skipped.
     */
    protected function crawlOtherPage()
    {
        if (empty($this->urls)) {
            return;
        }

        QueryList::rules($this->postRules())
            ->range('.atl-main .atl-item')
            ->multiGet($this->urls)
            // At most 5 requests in flight.
            ->concurrency(5)
            // Extra GuzzleHttp options.
            ->withOptions([
                'timeout' => 60,
            ])
            // HTTP headers.
            ->withHeaders([
                'User-Agent' => 'QueryList',
            ])
            // Called for every successful response.
            ->success(function (QueryList $ql, Response $response, $index) {
                $this->posts[$index] = $ql->queryData();
            })
            // Called for every failed request: report it instead of dropping it silently.
            ->error(function (QueryList $ql, $reason, $index) {
                // NOTE(review): assumes $index matches the position in $this->urls - confirm against multiGet().
                $target = $this->urls[$index] ?? "#{$index}";
                $detail = $reason instanceof \Throwable ? $reason->getMessage() : 'unknown error';
                error_log("Tianya crawler: failed to fetch {$target}: {$detail}");
            })
            ->send();

        $this->writePosts();
    }

    /**
     * Collapses every run of whitespace into a single space.
     *
     * @param string $string
     * @return string
     */
    protected function handlerSymbol($string)
    {
        return preg_replace('/\s+/', ' ', (string) $string);
    }

    /**
     * Normalises whitespace and terminates the text with a line break.
     *
     * @param string $string
     * @return string
     */
    protected function writeString($string)
    {
        return $this->handlerSymbol($string) . PHP_EOL;
    }

    /**
     * Formats the comments of one post, followed by five blank lines as separator.
     *
     * The "评论:" header is emitted only when there is at least one comment to list.
     *
     * @param array|mixed $comments comment texts; anything but a non-empty array yields only the separator
     * @return string
     */
    protected function writeComment($comments)
    {
        $string = "";
        if (is_array($comments) && !empty($comments)) {
            $string .= "评论:" . PHP_EOL;
            foreach ($comments as $comment) {
                $string .= " " . $this->handlerSymbol($comment) . PHP_EOL;
            }
        }
        $string .= "\n\n\n\n\n";

        return $string;
    }

    /**
     * Extraction rules shared by the first page and the concurrently fetched pages.
     *
     * @return array
     */
    private function postRules()
    {
        return [
            'author' => [".atl-info", "text"],
            'content' => [".bbs-content", "text"],
            'comments' => [".ir-list li", "texts", "-.ir-power -.ir-reply"],
        ];
    }

    /**
     * Builds the output file name from the title, replacing path separators,
     * characters Windows forbids in file names, and control characters with "_".
     *
     * @return string
     */
    private function outputFile()
    {
        $name = preg_replace('/[\\\\\/:*?"<>|\x00-\x1F]/', '_', (string) $this->title);

        return "{$name}.txt";
    }

    /**
     * Opens the output file in the given fopen() mode.
     *
     * @param string $mode
     * @return resource
     * @throws \RuntimeException when the file cannot be opened
     */
    private function openOutput($mode)
    {
        $file = $this->outputFile();
        $handler = fopen($file, $mode);
        if ($handler === false) {
            throw new \RuntimeException("Unable to open output file: {$file}");
        }

        return $handler;
    }

    /**
     * Writes author, content and comments of every post in $items to $handler.
     *
     * @param resource $handler open file handle
     * @param iterable $items   posts with "author", "content" and "comments" entries
     */
    private function writeItems($handler, $items)
    {
        foreach ($items as $item) {
            fwrite($handler, $this->writeString($item['author'] ?? ''));
            fwrite($handler, $this->writeString($item['content'] ?? ''));
            fwrite($handler, $this->writeComment($item['comments'] ?? []));
        }
    }
}
```
QueryList 库不好用 而且很久没更新了 素问何问 发表于 2022-12-19 18:03
QueryList 库不好用 而且很久没更新了
还行,我觉得算是php里面最好用的一款爬虫框架了 楼主厉害,学习了 看到这个是不是和抓取是一个意思呢、、{:301_1003:}
我抓了一个网站 直接打不开了{:301_1002:}
可有办法解决{:301_985:} alongzhenggang 发表于 2022-12-19 19:23
看到这个是不是和抓取是一个意思呢、、
我抓了一个网站 直接打不开了
可能ip被封了吧,换个ip再试试 php写爬虫太鸡肋了,基本没什么插件,爬虫还是py好用 多谢分享,学习了。
多谢分享,学习了。
页:
[1]