使用 QueryList 框架来解析 DOM。
话不多说,上代码:
<?php
namespace app\command\crawler;
use GuzzleHttp\Psr7\Response;
use QL\QueryList;
/**
 * Crawls a Tianya forum thread with QueryList and dumps it into a text file.
 *
 * execute() downloads the page at $url, extracts the title, the author line,
 * the pager and the posts of that page, writes them to "<title>.txt" in the
 * current working directory, then fetches the remaining pages concurrently
 * and appends their posts in page order.
 */
class Tianya
{
    /**
     * Address of the thread. getUrls() derives the other pages by replacing
     * "-1.shtml" with "-<n>.shtml" in this value.
     *
     * @var string
     */
    protected $url = "";

    /** @var string|null Thread title as scraped from ".s_title"; also used for the output file name. */
    protected $title;

    /** @var string|null Whitespace-normalised text of the first ".atl-info" element. */
    protected $author;

    /** @var string[] URLs of pages 2..N, filled by getUrls(). */
    protected $urls = [];

    /** @var array Posts extracted from the page at $url. */
    protected $mainPosts = [];

    /** @var array Posts of the remaining pages, keyed by the index of the page URL in $urls. */
    protected $posts = [];

    /**
     * Runs the whole crawl: first page, output file, then the remaining pages.
     *
     * @throws \RuntimeException when the output file cannot be opened
     */
    public function execute()
    {
        $html = QueryList::get($this->url);
        $this->title = $html->find(".s_title")->text();
        $this->author = $this->handlerSymbol($html->find(".atl-info:eq(0)")->text());

        // Only the numeric pager links are page numbers; the largest one is the last page.
        $pages = $html->find(".atl-pages:eq(0) a")->texts()->filter(function ($item) {
            return is_numeric($item);
        })->all();
        if (!empty($pages)) {
            $this->getUrls(max($pages));
        }

        $this->mainPosts = $html->rules($this->postRules())
            ->range(".atl-item")
            ->query()
            ->getData()
            ->all();

        $this->writeFile();
        $this->crawlOtherPage();
    }

    /**
     * Appends the URLs of pages 2..$maxPage to $urls.
     *
     * @param int|string $maxPage number of the last page; ignored when not numeric
     */
    protected function getUrls($maxPage)
    {
        if (!is_numeric($maxPage)) {
            return;
        }
        for ($page = 2; $page <= $maxPage; $page++) {
            $this->urls[] = str_replace("-1.shtml", "-{$page}.shtml", $this->url);
        }
    }

    /**
     * Extraction rules shared by the first page and the follow-up pages.
     *
     * @return array QueryList rule set for one post (".atl-item")
     */
    protected function postRules()
    {
        return [
            'author' => [".atl-info", "text"],
            'content' => [".bbs-content", "text"],
            'comments' => [".ir-list li", "texts", "-.ir-power -.ir-reply"],
        ];
    }

    /**
     * Builds the output file name from the scraped title.
     *
     * Path separators, characters that are illegal in file names on common
     * file systems and control characters (including newlines) are replaced
     * by "_" so the title can neither break fopen() nor escape the working
     * directory. The pattern is byte-wise and only touches ASCII bytes, so
     * multi-byte UTF-8 characters in the title are left intact.
     *
     * @return string
     */
    protected function outputFile()
    {
        $name = preg_replace('/[\\\\\/:*?"<>|\x00-\x1F]+/', '_', trim((string) $this->title));
        if ($name === null || $name === '') {
            $name = 'tianya';
        }

        return "{$name}.txt";
    }

    /**
     * Opens the output file and fails loudly instead of passing `false` on to fwrite().
     *
     * @param string $mode fopen() mode
     * @return resource
     * @throws \RuntimeException when the file cannot be opened
     */
    protected function openOutput($mode)
    {
        $file = $this->outputFile();
        $handler = fopen($file, $mode);
        if ($handler === false) {
            throw new \RuntimeException("Unable to open output file: {$file}");
        }

        return $handler;
    }

    /**
     * Writes author, content and comments of every post in $items to $handler.
     *
     * @param resource $handler open file handle
     * @param array    $items   posts with the keys "author", "content" and "comments"
     */
    protected function writeItems($handler, $items)
    {
        foreach ($items as $item) {
            fwrite($handler, $this->writeString($item['author']));
            fwrite($handler, $this->writeString($item['content']));
            fwrite($handler, $this->writeComment($item['comments']));
        }
    }

    /**
     * (Re)creates the output file with the title, the author line and the posts of the first page.
     *
     * @throws \RuntimeException when the output file cannot be opened
     */
    protected function writeFile()
    {
        // "w" truncates a file left over from a previous run.
        $handler = $this->openOutput("w");
        try {
            fwrite($handler, $this->title . PHP_EOL);
            fwrite($handler, $this->author . PHP_EOL);
            $this->writeItems($handler, $this->mainPosts);
        } finally {
            fclose($handler);
        }
    }

    /**
     * Appends the posts of the follow-up pages to the output file in page order.
     *
     * @throws \RuntimeException when the output file cannot be opened
     */
    protected function writePosts()
    {
        if (empty($this->posts)) {
            return;
        }

        // Responses arrive in completion order. The keys are the request
        // indexes, so sorting by key restores the page order; sort() would
        // drop the keys and order the pages by their content instead.
        ksort($this->posts);

        $handler = $this->openOutput("a+");
        try {
            foreach ($this->posts as $page) {
                $this->writeItems($handler, $page);
            }
        } finally {
            fclose($handler);
        }
    }

    /**
     * Fetches all URLs in $urls concurrently, collects their posts and appends them to the file.
     *
     * @throws \RuntimeException when the output file cannot be opened
     */
    protected function crawlOtherPage()
    {
        if (empty($this->urls)) {
            return;
        }

        QueryList::rules($this->postRules())
            ->range('.atl-main .atl-item')
            ->multiGet($this->urls)
            // Run at most 5 requests at the same time.
            ->concurrency(5)
            // Additional GuzzleHttp options.
            ->withOptions([
                'timeout' => 60
            ])
            // HTTP headers sent with every request.
            ->withHeaders([
                'User-Agent' => 'QueryList'
            ])
            // Called for every successful response; $index identifies the request.
            ->success(function (QueryList $ql, Response $response, $index) {
                $this->posts[$index] = $ql->queryData();
            })
            // Called for every failed request: report it instead of dropping the page silently.
            ->error(function (QueryList $ql, $reason, $index) {
                $url = isset($this->urls[$index]) ? $this->urls[$index] : "#{$index}";
                $detail = $reason instanceof \Exception ? $reason->getMessage() : 'unknown error';
                error_log("Tianya crawler: failed to fetch {$url}: {$detail}");
            })
            ->send();

        $this->writePosts();
    }

    /**
     * Collapses every run of whitespace into a single space.
     *
     * The pattern runs in UTF-8 mode so that Unicode whitespace (for example
     * non-breaking or ideographic spaces) is collapsed as well and multi-byte
     * characters are never matched byte-wise. In UTF-8 mode preg_replace()
     * returns null for input that is not valid UTF-8; in that case the
     * byte-wise pattern is used as a fallback.
     *
     * @param string $string
     * @return string
     */
    protected function handlerSymbol($string)
    {
        $collapsed = preg_replace('/\s+/u', ' ', $string);
        if ($collapsed === null) {
            $collapsed = preg_replace('/\s+/', ' ', $string);
        }

        return $collapsed;
    }

    /**
     * Normalises whitespace and terminates the text with a line break.
     *
     * @param string $string
     * @return string
     */
    protected function writeString($string)
    {
        return $this->handlerSymbol($string) . PHP_EOL;
    }

    /**
     * Renders the comment section of one post.
     *
     * A header line is emitted when there are comments, followed by one
     * indented line per comment; the result always ends with five line feeds
     * that separate the post from the next one.
     *
     * @param array|mixed $comments comment texts of the post
     * @return string
     */
    protected function writeComment($comments)
    {
        $string = "";
        if ($comments) {
            $string .= "评论:" . PHP_EOL;
            if (is_array($comments)) {
                foreach ($comments as $comment) {
                    $string .= " " . $this->handlerSymbol($comment) . PHP_EOL;
                }
            }
        }
        $string .= "\n\n\n\n\n";

        return $string;
    }
}
|